author    Diego Lopez Recas <Diego.LopezRecas@arm.com>  2017-12-04 18:56:10 +0000
committer Anthony Barbier <anthony.barbier@arm.com>     2018-11-02 16:45:00 +0000
commit    35ceeb2199c569810a1524a0a21c2df2a3f5f29e
tree      4a55f8626cb2960843547fabdb2431a70ec1029a
parent    97cf2497d2b617de3209330893ad51bd0cc126ce
IVGCVSW-798 Add Softmax NEON support for QASYMM8
Change-Id: I4f2cca52caf210fdb7d6bb7e9436ac51cb5088b4
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112398
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/core/AccessWindowAutoPadding.h            |    4
-rw-r--r--  arm_compute/core/AccessWindowStatic.h                 |    4
-rw-r--r--  arm_compute/core/AccessWindowTranspose.h              |    4
-rw-r--r--  arm_compute/core/Error.h                              |    6
-rw-r--r--  arm_compute/core/FixedPoint.inl                       |   46
-rw-r--r--  arm_compute/core/Helpers.h                            |    4
-rw-r--r--  arm_compute/core/IAccessWindow.h                      |    6
-rw-r--r--  arm_compute/core/ITensorInfo.h                        |    8
-rw-r--r--  arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h  |  116
-rw-r--r--  arm_compute/core/SubTensorInfo.h                      |    8
-rw-r--r--  arm_compute/core/TensorInfo.h                         |   10
-rw-r--r--  arm_compute/core/TensorShape.h                        |   26
-rw-r--r--  arm_compute/core/Types.h                              |   15
-rw-r--r--  arm_compute/core/Utils.h                              |   16
-rw-r--r--  arm_compute/core/utils/misc/utility.h                 |   19
-rw-r--r--  arm_compute/runtime/NEON/functions/NESoftmaxLayer.h   |   30
-rw-r--r--  docs/00_introduction.dox                              |    6
-rwxr-xr-x  scripts/clang_tidy_rules.py                           |    2
-rw-r--r--  src/core/AccessWindowAutoPadding.cpp                  |    4
-rw-r--r--  src/core/AccessWindowStatic.cpp                       |    4
-rw-r--r--  src/core/AccessWindowTranspose.cpp                    |    4
-rw-r--r--  src/core/Error.cpp                                    |    4
-rw-r--r--  src/core/IAccessWindow.cpp                            |   27
-rw-r--r--  src/core/NEON/kernels/NESoftmaxLayerKernel.cpp        | 1369
-rw-r--r--  src/core/SubTensorInfo.cpp                            |    2
-rw-r--r--  src/core/TensorInfo.cpp                               |    4
-rw-r--r--  src/runtime/NEON/functions/NESoftmaxLayer.cpp         |   44
-rw-r--r--  tests/validation/NEON/SoftmaxLayer.cpp                |   33
28 files changed, 939 insertions(+), 886 deletions(-)
diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h
index 0a3344b115..0003bb26cd 100644
--- a/arm_compute/core/AccessWindowAutoPadding.h
+++ b/arm_compute/core/AccessWindowAutoPadding.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,7 +66,7 @@ public:
// Inherited methods overridden:
bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
private:
diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h
index 6dcba072c4..a0ceeda273 100644
--- a/arm_compute/core/AccessWindowStatic.h
+++ b/arm_compute/core/AccessWindowStatic.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,7 +79,7 @@ public:
// Inherited methods overridden:
bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
ITensorInfo *_info;
diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/core/AccessWindowTranspose.h
index 102860f9d8..4e59e58dce 100644
--- a/arm_compute/core/AccessWindowTranspose.h
+++ b/arm_compute/core/AccessWindowTranspose.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ class AccessWindowTranspose : public AccessWindowRectangle
public:
using AccessWindowRectangle::AccessWindowRectangle;
bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
using AccessWindowRectangle::compute_valid_region;
ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
};
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index 97dbba3fab..56c7ccdd93 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,7 +86,7 @@ public:
return _error_description;
}
/** Throws a runtime exception in case it contains a valid error status */
- void throw_if_error()
+ void throw_if_error() const
{
if(!bool(*this))
{
@@ -96,7 +96,7 @@ public:
private:
/** Internal throwing function */
- [[noreturn]] void internal_throw_on_error();
+ [[noreturn]] void internal_throw_on_error() const;
private:
ErrorCode _code;
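Making Status::throw_if_error() const means it can now be called on the temporary Status returned by a validate() function. A minimal usage sketch, assuming hypothetical pre-configured tensor infos:

    // st is const, which previously would not compile with throw_if_error().
    const arm_compute::Status st = arm_compute::NELogits1DMaxKernel::validate(&input_info, &output_info);
    st.throw_if_error(); // throws std::runtime_error if validation failed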
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
index 5ea0f6c825..9c7e35ab16 100644
--- a/arm_compute/core/FixedPoint.inl
+++ b/arm_compute/core/FixedPoint.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,27 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/utility.h"
#include <cmath>
#include <limits>
-namespace
-{
-template <typename TpIn, typename TpSat>
-inline TpSat saturate_convert(TpIn a)
-{
- if(a > std::numeric_limits<TpSat>::max())
- {
- a = std::numeric_limits<TpSat>::max();
- }
- if(a < std::numeric_limits<TpSat>::min())
- {
- a = std::numeric_limits<TpSat>::min();
- }
- return static_cast<TpSat>(a);
-}
-} // namespace
-
namespace arm_compute
{
inline qint8_t sqshl_qs8(qint8_t a, int shift)
@@ -50,7 +34,7 @@ inline qint8_t sqshl_qs8(qint8_t a, int shift)
qint16_t tmp = static_cast<qint16_t>(a) << shift;
// Saturate the result in case of overflow and cast to qint8_t
- return saturate_convert<qint16_t, qint8_t>(tmp);
+ return utility::saturate_cast<qint8_t>(tmp);
}
inline qint16_t sqshl_qs16(qint16_t a, int shift)
@@ -58,7 +42,7 @@ inline qint16_t sqshl_qs16(qint16_t a, int shift)
qint32_t tmp = static_cast<qint32_t>(a) << shift;
// Saturate the result in case of overflow and cast to qint16_t
- return saturate_convert<qint32_t, qint16_t>(tmp);
+ return utility::saturate_cast<qint16_t>(tmp);
}
inline qint8_t sshr_qs8(qint8_t a, int shift)
@@ -101,7 +85,7 @@ inline qint8_t sqadd_qs8(qint8_t a, qint8_t b)
qint16_t tmp = (static_cast<qint16_t>(a) + static_cast<qint16_t>(b));
// Saturate the result in case of overflow and cast to qint8_t
- return saturate_convert<qint16_t, qint8_t>(tmp);
+ return utility::saturate_cast<qint8_t>(tmp);
}
inline qint16_t sqadd_qs16(qint16_t a, qint16_t b)
@@ -110,7 +94,7 @@ inline qint16_t sqadd_qs16(qint16_t a, qint16_t b)
qint32_t tmp = (static_cast<qint32_t>(a) + static_cast<qint32_t>(b));
// Saturate the result in case of overflow and cast to qint16_t
- return saturate_convert<qint32_t, qint16_t>(tmp);
+ return utility::saturate_cast<qint16_t>(tmp);
}
inline qint32_t sqadd_qs32(qint32_t a, qint32_t b)
@@ -119,7 +103,7 @@ inline qint32_t sqadd_qs32(qint32_t a, qint32_t b)
qint64_t tmp = (static_cast<qint64_t>(a) + static_cast<qint64_t>(b));
// Saturate the result in case of overflow and cast to qint32_t
- return saturate_convert<qint64_t, qint32_t>(tmp);
+ return utility::saturate_cast<qint32_t>(tmp);
}
inline qint8_t ssub_qs8(qint8_t a, qint8_t b)
@@ -138,7 +122,7 @@ inline qint8_t sqsub_qs8(qint8_t a, qint8_t b)
qint16_t tmp = static_cast<qint16_t>(a) - static_cast<qint16_t>(b);
// Saturate the result in case of overflow and cast to qint8_t
- return saturate_convert<qint16_t, qint8_t>(tmp);
+ return utility::saturate_cast<qint8_t>(tmp);
}
inline qint16_t sqsub_qs16(qint16_t a, qint16_t b)
@@ -147,7 +131,7 @@ inline qint16_t sqsub_qs16(qint16_t a, qint16_t b)
qint32_t tmp = static_cast<qint32_t>(a) - static_cast<qint32_t>(b);
// Saturate the result in case of overflow and cast to qint16_t
- return saturate_convert<qint32_t, qint16_t>(tmp);
+ return utility::saturate_cast<qint16_t>(tmp);
}
inline qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
@@ -183,7 +167,7 @@ inline qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
// Rounding up
tmp += round_up_const;
- return saturate_convert<qint16_t, qint8_t>(tmp >> fixed_point_position);
+ return utility::saturate_cast<qint8_t>(tmp >> fixed_point_position);
}
inline qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position)
@@ -195,7 +179,7 @@ inline qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position)
// Rounding up
tmp += round_up_const;
- return saturate_convert<qint32_t, qint16_t>(tmp >> fixed_point_position);
+ return utility::saturate_cast<qint16_t>(tmp >> fixed_point_position);
}
inline qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position)
@@ -394,7 +378,7 @@ inline float scvt_f32_qs8(qint8_t a, int fixed_point_position)
inline qint8_t sqcvt_qs8_f32(float a, int fixed_point_position)
{
// round_nearest_integer(a * 2^(fixed_point_position))
- return saturate_convert<float, qint8_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
+ return utility::saturate_cast<qint8_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
}
inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
@@ -405,18 +389,18 @@ inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
inline qint16_t sqcvt_qs16_f32(float a, int fixed_point_position)
{
// round_nearest_integer(a * 2^(fixed_point_position))
- return saturate_convert<float, qint16_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
+ return utility::saturate_cast<qint16_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
}
inline qint8_t sqmovn_qs16(qint16_t a)
{
// Saturate the result in case of overflow and cast to qint8_t
- return saturate_convert<qint16_t, qint8_t>(a);
+ return utility::saturate_cast<qint8_t>(a);
}
inline qint16_t sqmovn_qs32(qint32_t a)
{
// Saturate the result in case of overflow and cast to qint16_t
- return saturate_convert<qint32_t, qint16_t>(a);
+ return utility::saturate_cast<qint16_t>(a);
}
}
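Every sq* helper in this file follows the same widen-operate-saturate pattern, now routed through utility::saturate_cast. A worked example of the saturating add:

    // 100 + 100 widens to 200 in qint16_t, then saturates back down to
    // the qint8_t maximum instead of wrapping around.
    const arm_compute::qint8_t s = arm_compute::sqadd_qs8(100, 100); // 127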
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index e01e4baa6b..c6a7db4f96 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -350,7 +350,7 @@ bool update_window_and_padding(Window &win, Ts &&... patterns)
bool padding_changed = false;
- utility::for_each([&](const IAccessWindow & w)
+ utility::for_each([&](IAccessWindow & w)
{
padding_changed |= w.update_padding_if_needed(win);
},
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
index cf7490d53e..583041a48b 100644
--- a/arm_compute/core/IAccessWindow.h
+++ b/arm_compute/core/IAccessWindow.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -85,7 +85,7 @@ public:
*
* @return True if the padding has been changed.
*/
- virtual bool update_padding_if_needed(const Window &window) const = 0;
+ virtual bool update_padding_if_needed(const Window &window) = 0;
/** Compute the valid region based on access pattern and valid region of the inputs.
*
* @note This method assumes that there is no border.
@@ -168,7 +168,7 @@ public:
ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) const override;
+ bool update_padding_if_needed(const Window &window) override;
protected:
ITensorInfo *_info;
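update_padding_if_needed() loses its const qualifier because it may resize the padding of the tensor behind _info, i.e. it mutates observable state; the Helpers.h lambda above changes to IAccessWindow & to match. The typical call pattern, sketched with hypothetical tensor infos and step counts:

    Window win = calculate_max_window(*input_info, Steps(num_elems));
    AccessWindowHorizontal input_access(input_info, 0, num_elems);
    AccessWindowHorizontal output_access(output_info, 0, num_elems);
    // May grow each tensor's padding to fit the window, hence non-const.
    const bool padding_changed = update_window_and_padding(win, input_access, output_access);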
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index 9a67712f3d..9112f3ea18 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,7 +79,7 @@ public:
*
* @return Reference to this ITensorInfo object
*/
- virtual ITensorInfo &set_tensor_shape(TensorShape shape) = 0;
+ virtual ITensorInfo &set_tensor_shape(const TensorShape &shape) = 0;
/** Set the fixed point position to the specified value
*
* @warning The fixed point position must be set once the data type has been configured
@@ -95,7 +95,7 @@ public:
*
* @return Reference to this ITensorInfo object
*/
- virtual ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) = 0;
+ virtual ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) = 0;
/** Resets the padding settings of the tensor.
*
* @return Reference to this ITensorInfo object
@@ -214,7 +214,7 @@ public:
*
* @param[in] valid_region Valid region to set.
*/
- virtual void set_valid_region(ValidRegion valid_region) = 0;
+ virtual void set_valid_region(const ValidRegion &valid_region) = 0;
/** Get the quantization settings (scale and offset) of the tensor.
*
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
index bd0e642d76..c30a4cd23d 100644
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -43,13 +43,13 @@ public:
NELogits1DMaxKernel();
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
* @param[out] output Destination tensor. Data types supported: same as @p input
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
* @param[in] output Destination tensor. Data types supported: same as @p input
*
* @return a status
@@ -61,117 +61,71 @@ public:
BorderSize border_size() const override;
private:
- using Logits1DMaxFunction = void(const ITensor *in, ITensor *out, const Window &window);
+ using Logits1DMaxFunction = void(const ITensor &in, ITensor &out, const Window &window);
private:
Logits1DMaxFunction *_func;
BorderSize _border_size;
};
-/** Interface for shifting the logits values around the max value and exponentiating the result */
-class NELogits1DShiftExpSumKernel : public INEKernel
+/** Interface for softmax computation with a pre-computed max value. */
+class NELogits1DSoftmaxKernel : public INEKernel
{
public:
const char *name() const override
{
- return "NELogits1DShiftExpSumKernel";
+ return "NELogits1DSoftmaxKernel";
}
/** Default constructor */
- NELogits1DShiftExpSumKernel();
+ NELogits1DSoftmaxKernel();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NELogits1DShiftExpSumKernel(const NELogits1DShiftExpSumKernel &) = delete;
+ NELogits1DSoftmaxKernel(const NELogits1DSoftmaxKernel &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NELogits1DShiftExpSumKernel &operator=(const NELogits1DShiftExpSumKernel &) = delete;
+ NELogits1DSoftmaxKernel &operator=(const NELogits1DSoftmaxKernel &) = delete;
/** Allow instances of this class to be moved */
- NELogits1DShiftExpSumKernel(NELogits1DShiftExpSumKernel &&) = default;
+ NELogits1DSoftmaxKernel(NELogits1DSoftmaxKernel &&) = default;
/** Allow instances of this class to be moved */
- NELogits1DShiftExpSumKernel &operator=(NELogits1DShiftExpSumKernel &&) = default;
+ NELogits1DSoftmaxKernel &operator=(NELogits1DSoftmaxKernel &&) = default;
/** Default destructor */
- ~NELogits1DShiftExpSumKernel() = default;
+ ~NELogits1DSoftmaxKernel() = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
- * @param[in] max Max values tensor. Data types supported: same as @p input.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+ * @param[in] max Max values tensor. Same shape as input with dimension 0 set to 1.
+ * Data types supported: same as @p input.
* @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input.
- * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
- */
- void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta = 1.0f);
- /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DShiftExpSumKernel
- *
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[in] output Destination tensor. Data types supported: same as @p input.
- * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input.
- * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta = 1.0f);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta);
-
-private:
- Logits1DShiftExpSumFunction *_func;
- const ITensor *_input;
- const ITensor *_max;
- ITensor *_output;
- ITensor *_sum;
- float _beta;
-};
-
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class NELogits1DNormKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NELogits1DNormKernel";
- }
- /** Default constructor */
- NELogits1DNormKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELogits1DNormKernel(const NELogits1DNormKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELogits1DNormKernel &operator=(const NELogits1DNormKernel &) = delete;
- /** Allow instances of this class to be moved */
- NELogits1DNormKernel(NELogits1DNormKernel &&) = default;
- /** Allow instances of this class to be moved */
- NELogits1DNormKernel &operator=(NELogits1DNormKernel &&) = default;
- /** Default destructor */
- ~NELogits1DNormKernel() = default;
- /** Set the input and output tensors.
+ * @param[in] beta A scaling factor for the exponent.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
- * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
+ * @param tmp Auxiliary tensor. Must be of type F32 and have the same shape as the input.
*/
- void configure(const ITensor *input, const ITensor *sum, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DNormKernel
+ void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp);
+ /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32
- * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input.
- * @param[in] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+ * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
+ * Data types supported: same as @p input.
+ * @param[in] output Destination tensor info. Data types supported: same as @p input.
+ * @param[in] beta A scaling factor for the exponent.
+ * @param[in] tmp Tensor info of the auxiliary tensor. Must be of type F32 and have the same shape as the input.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *max,
+ const ITensorInfo *output, const float beta, const ITensorInfo *tmp);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
- using Logits1DNormFunction = void(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window);
+ using LogitsSoftmaxFunction = void(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta,
+ const Window &window);
-private:
- Logits1DNormFunction *_func;
- const ITensor *_input;
- const ITensor *_sum;
- ITensor *_output;
+ LogitsSoftmaxFunction *_func;
+ const ITensor *_input;
+ const ITensor *_max;
+ ITensor *_output;
+ float _beta;
+ ITensor *_tmp; // Temporary. Used internally
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ */
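The former shift-exp-sum and norm kernels collapse into a single NELogits1DSoftmaxKernel that takes the pre-computed row maxima plus an F32 scratch tensor. A configuration sketch based only on the signatures above (tensor allocation elided, tensor names hypothetical):

    NELogits1DMaxKernel     max_kernel;
    NELogits1DSoftmaxKernel softmax_kernel;
    // max: input shape with dimension 0 set to 1; tmp: F32, same shape as input.
    max_kernel.configure(&input, &max);
    softmax_kernel.configure(&input, &max, &output, 1.0f /* beta */, &tmp);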
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 67574f1326..7f4239d49b 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -98,8 +98,8 @@ public:
_parent->set_fixed_point_position(fixed_point_position);
return *this;
};
- ITensorInfo &set_tensor_shape(TensorShape shape) override;
- ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) override
+ ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
+ ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_quantization_info(quantization_info);
@@ -196,7 +196,7 @@ public:
{
return _valid_region;
}
- void set_valid_region(ValidRegion valid_region) override
+ void set_valid_region(const ValidRegion &valid_region) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
// Check if subtensor is valid if parent is configured
@@ -204,7 +204,7 @@ public:
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
}
- _valid_region = std::move(valid_region);
+ _valid_region = valid_region;
}
QuantizationInfo quantization_info() const override
{
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 80ef7f8d5a..0b8989f942 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -217,9 +217,9 @@ public:
ITensorInfo &set_data_type(DataType data_type) override;
ITensorInfo &set_num_channels(int num_channels) override;
ITensorInfo &set_format(Format format) override;
- ITensorInfo &set_tensor_shape(TensorShape shape) override;
+ ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
ITensorInfo &set_fixed_point_position(int fixed_point_position) override;
- ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) override;
+ ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override;
ITensorInfo &reset_padding() override;
bool auto_padding() override;
bool extend_padding(const PaddingSize &padding) override;
@@ -289,9 +289,9 @@ public:
{
return _valid_region;
}
- void set_valid_region(ValidRegion valid_region) override
+ void set_valid_region(const ValidRegion &valid_region) override
{
- _valid_region = std::move(valid_region);
+ _valid_region = valid_region;
}
QuantizationInfo quantization_info() const override
{
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
index ad102607e8..50f1211c18 100644
--- a/arm_compute/core/TensorShape.h
+++ b/arm_compute/core/TensorShape.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,26 +70,30 @@ public:
*
* @param[in] dimension Dimension for which the value is set.
* @param[in] value Value to be set for the dimension.
+ *
+ * @return *this.
*/
- void set(size_t dimension, size_t value)
+ TensorShape &set(size_t dimension, size_t value)
{
// Clear entire shape if one dimension is zero
if(value == 0)
{
_num_dimensions = 0;
std::fill(_id.begin(), _id.end(), 0);
- return;
}
+ else
+ {
+ // Make sure all empty dimensions are filled with 1
+ std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
- // Make sure all empty dimensions are filled with 1
- std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
-
- // Set the specified dimension and increase the number of dimensions if
- // necessary
- Dimensions::set(dimension, value);
+ // Set the specified dimension and increase the number of dimensions if
+ // necessary
+ Dimensions::set(dimension, value);
- // Correct number dimensions to ignore trailing dimensions of size 1
- apply_dimension_correction();
+ // Correct number dimensions to ignore trailing dimensions of size 1
+ apply_dimension_correction();
+ }
+ return *this;
}
/** Accessor to remove the dimension n from the tensor shape.
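Returning *this from set() enables the chained shape construction used later in this very patch when deriving the softmax output shape. A small sketch (input_shape is a hypothetical placeholder):

    // Same shape as the input, but with the x dimension collapsed to 1.
    const TensorShape output_shape = TensorShape(input_shape).set(0, 1);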
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 5197000bf9..aa415acebe 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -190,6 +190,21 @@ struct ValidRegion
return anchor[d] + shape[d];
}
+ /** Accessor to set the value of anchor and shape for one of the dimensions.
+ *
+ * @param[in] dimension Dimension for which the value is set.
+ * @param[in] start Value to be set in anchor for the dimension.
+ * @param[in] size Value to be set in shape for the dimension.
+ *
+ * @return *this.
+ */
+ ValidRegion &set(size_t dimension, int start, size_t size)
+ {
+ anchor.set(dimension, start);
+ shape.set(dimension, size);
+ return *this;
+ }
+
Coordinates anchor;
TensorShape shape;
};
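The new ValidRegion::set() mirrors TensorShape::set(): it updates anchor and shape for one dimension and returns *this for chaining, as the kernel code below does with ValidRegion(input.valid_region()).set(0, 0, 1). A minimal sketch:

    // Copy in_region, then anchor dimension 0 at offset 0 with size 1.
    const ValidRegion out_region = ValidRegion(in_region).set(0, 0, 1);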
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 51967b1762..fc89d97073 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -40,12 +40,19 @@
namespace arm_compute
{
+/** Calculate the rounded up quotient of val / m. */
+template <typename S, typename T>
+constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
+{
+ return (val + m - 1) / m;
+}
+
/** Computes the smallest number larger or equal to value that is a multiple of divisor. */
template <typename S, typename T>
inline auto ceil_to_multiple(S value, T divisor) -> decltype(((value + divisor - 1) / divisor) * divisor)
{
ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
- return ((value + divisor - 1) / divisor) * divisor;
+ return DIV_CEIL(value, divisor) * divisor;
}
/** Computes the largest number smaller or equal to value that is a multiple of divisor. */
@@ -56,13 +63,6 @@ inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor)
return (value / divisor) * divisor;
}
-/** Calculate the rounded up quotient of val / m. */
-template <typename S, typename T>
-constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
-{
- return (val + m - 1) / m;
-}
-
/** Returns the arm_compute library build information
*
* Contains the version number and the build options used to build the library
diff --git a/arm_compute/core/utils/misc/utility.h b/arm_compute/core/utils/misc/utility.h
index 45b3b5268e..e8d823b5bc 100644
--- a/arm_compute/core/utils/misc/utility.h
+++ b/arm_compute/core/utils/misc/utility.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define __ARM_COMPUTE_MISC_UTILITY_H__
#include <array>
+#include <limits>
namespace arm_compute
{
@@ -123,6 +124,22 @@ inline auto foldl(F &&func, T &&initial, U &&value, Us &&... values) -> decltype
{
return foldl(std::forward<F>(func), func(std::forward<T>(initial), std::forward<U>(value)), std::forward<Us>(values)...);
}
+
+/** Type cast with saturation.
+ *
+ * @param[in] val Value of type U to cast.
+ *
+ * @return Original value clamped to numeric limits of T and converted to type T.
+ *
+ * @warning Numeric limits of T must be representable without loss in type U.
+ */
+template <typename T, typename U>
+T saturate_cast(U val)
+{
+ const auto low = static_cast<U>(std::numeric_limits<T>::lowest());
+ const auto high = static_cast<U>(std::numeric_limits<T>::max());
+ return static_cast<T>(clamp(val, low, high));
+}
} // namespace utility
} // namespace arm_compute
#endif /* __ARM_COMPUTE_MISC_UTILITY_H__ */
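utility::saturate_cast replaces the file-local saturate_convert from FixedPoint.inl: it clamps the value to the destination type's numeric limits (via the clamp helper already in this header) before converting. For example:

    // Out-of-range values saturate instead of wrapping.
    const auto a = arm_compute::utility::saturate_cast<int8_t>(300);  // 127
    const auto b = arm_compute::utility::saturate_cast<int8_t>(-300); // -128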
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index 5043f79c23..3d981b6f75 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,9 +40,9 @@ class ITensor;
* @f[ out = \frac{e^{x - max(x)}}{\sum{e^{x - max(x)}}} @f]
*
* This function runs the following kernels:
+ * -# @ref NEFillBorderKernel
* -# @ref NELogits1DMaxKernel
- * -# @ref NELogits1DShiftExpSumKernel
- * -# @ref NELogits1DNormKernel
+ * -# @ref NELogits1DSoftmaxKernel
*/
class NESoftmaxLayer : public IFunction
{
@@ -51,14 +51,16 @@ public:
NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
+ * @param[in,out] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32. If the width is not a
+ * multiple of the internal processing block size, @ref NEFillBorderKernel replicates the
+ * last value of each row to the nearest multiple.
+ * @param[out] output Destination tensor. Data types supported: same as @p input.
+ * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
*/
void configure(ITensor *input, ITensor *output, float beta = 1.0f);
/** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer
*
- * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
* @param[in] output Destination tensor. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1.
*
@@ -70,14 +72,12 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- NELogits1DMaxKernel _max_kernel;
- NELogits1DShiftExpSumKernel _shift_exp_sum_kernel;
- NELogits1DNormKernel _norm_kernel;
- NEFillBorderKernel _fill_border_kernel;
- Tensor _max;
- Tensor _sum;
- Tensor _tmp;
+ MemoryGroup _memory_group;
+ NELogits1DMaxKernel _max_kernel;
+ NELogits1DSoftmaxKernel _softmax_kernel;
+ NEFillBorderKernel _fill_border_kernel;
+ Tensor _max;
+ Tensor _tmp;
};
}
#endif /* __ARM_COMPUTE_NESOFTMAXLAYER_H__ */
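The function still computes the numerically stable softmax given in the class comment, out = e^(x - max(x)) / sum(e^(x - max(x))), only now in two kernels instead of three. A scalar reference for a single row in plain C++ (illustration only, not the NEON path):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    std::vector<float> softmax_row(const std::vector<float> &x, float beta = 1.0f)
    {
        const float max = *std::max_element(x.begin(), x.end());
        std::vector<float> out(x.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            out[i] = std::exp(beta * (x[i] - max)); // shift by max for stability
            sum += out[i];
        }
        for(float &v : out)
        {
            v /= sum; // normalize so the row sums to 1
        }
        return out;
    }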
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index fa6c22713f..787b38dde0 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -360,8 +360,8 @@ v17.04 Public bug fixes release
- @ref arm_compute::NEHarrisScoreKernel
- @ref arm_compute::NEHOGDetectorKernel
- @ref arm_compute::NELogits1DMaxKernel
- - @ref arm_compute::NELogits1DShiftExpSumKernel
- - @ref arm_compute::NELogits1DNormKernel
+ - arm_compute::NELogits1DShiftExpSumKernel
+ - arm_compute::NELogits1DNormKernel
- @ref arm_compute::NENonMaximaSuppression3x3FP16Kernel
- @ref arm_compute::NENonMaximaSuppression3x3Kernel
@@ -374,7 +374,7 @@ v17.03.1 First Major public release of the sources
- New NEON kernels / functions:
- @ref arm_compute::NENormalizationLayerKernel / @ref arm_compute::NENormalizationLayer
- @ref arm_compute::NETransposeKernel / @ref arm_compute::NETranspose
- - @ref arm_compute::NELogits1DMaxKernel, @ref arm_compute::NELogits1DShiftExpSumKernel, @ref arm_compute::NELogits1DNormKernel / @ref arm_compute::NESoftmaxLayer
+ - @ref arm_compute::NELogits1DMaxKernel, arm_compute::NELogits1DShiftExpSumKernel, arm_compute::NELogits1DNormKernel / @ref arm_compute::NESoftmaxLayer
- @ref arm_compute::NEIm2ColKernel, @ref arm_compute::NECol2ImKernel, arm_compute::NEConvolutionLayerWeightsReshapeKernel / @ref arm_compute::NEConvolutionLayer
- @ref arm_compute::NEGEMMMatrixAccumulateBiasesKernel / @ref arm_compute::NEFullyConnectedLayer
- @ref arm_compute::NEGEMMLowpMatrixMultiplyKernel / arm_compute::NEGEMMLowp
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index c627e03c42..a250b519b9 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -83,7 +83,7 @@ def filter_clang_tidy_lines( lines ):
("ReferenceCPP.cpp" in line and re.search(r"parameter '[^']+' is unused", line)) or
("NEGEMMMatrixMultiplyKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
("NEPoolingLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
- ("NESoftmaxLayerKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or
+ ("NESoftmaxLayerKernel.cpp" in line and "macro argument should be enclosed in parentheses" in line) or
("GraphUtils.cpp" in line and "consider replacing 'unsigned long' with 'uint32'" in line) or
("GraphUtils.cpp" in line and "consider replacing 'unsigned long' with 'uint64'" in line) or
("ConvolutionLayer.cpp" in line and "move assignment operators should be marked noexcept" in line) or
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index b75ebcfeb8..74af99bbb9 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,7 +70,7 @@ bool AccessWindowAutoPadding::update_window_if_needed(Window &window) const
return false;
}
-bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window) const
+bool AccessWindowAutoPadding::update_padding_if_needed(const Window &window)
{
ARM_COMPUTE_UNUSED(window);
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 81ad60bc4c..2ddd59ed4d 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -163,7 +163,7 @@ bool AccessWindowStatic::update_window_if_needed(Window &window) const
return window_modified;
}
-bool AccessWindowStatic::update_padding_if_needed(const Window &window) const
+bool AccessWindowStatic::update_padding_if_needed(const Window &window)
{
ARM_COMPUTE_UNUSED(window);
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 4506a0b44c..3c45ab3571 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -180,7 +180,7 @@ bool AccessWindowTranspose::update_window_if_needed(Window &window) const
return window_modified;
}
-bool AccessWindowTranspose::update_padding_if_needed(const Window &window) const
+bool AccessWindowTranspose::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
if(_info == nullptr || !_info->is_resizable())
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index b593c27a7f..2f6a94bb85 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,7 @@ void arm_compute::error(const char *function, const char *file, const int line,
va_end(args);
throw std::runtime_error(err.error_description());
}
-void Status::internal_throw_on_error()
+void Status::internal_throw_on_error() const
{
throw std::runtime_error(_error_description);
}
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 693d851a5d..7dfe5db5c5 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -122,9 +122,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
if(min_y < front_pad_y_available)
{
// Not enough padding available, need to shrink the window
- const int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
+ int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
+ start = std::min<int>(start / _scale_y, window.y().end());
- window.set(1, Window::Dimension(start / _scale_y, window.y().end(), window.y().step()));
+ window.set(1, Window::Dimension(start, window.y().end(), window.y().step()));
window_modified = true;
}
@@ -143,8 +144,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
- window.set(1, Window::Dimension(window.y().start(), end / _scale_y, window.y().step()));
+ int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
+ end = std::max<int>(window.y().start(), end / _scale_y);
+
+ window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
window_modified = true;
}
}
@@ -164,8 +167,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
if(min_x < front_pad_x_available)
{
// Not enough padding available, need to shrink the window
- const int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
- window.set(0, Window::Dimension(start / _scale_x, window.x().end(), window.x().step()));
+ int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
+ start = std::min<int>(start / _scale_x, window.x().end());
+
+ window.set(0, Window::Dimension(start, window.x().end(), window.x().step()));
window_modified = true;
}
@@ -181,8 +186,10 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
{
// Not enough padding available, need to shrink the window
- const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
- window.set(0, Window::Dimension(window.x().start(), end / _scale_x, window.x().step()));
+ int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
+ end = std::max<int>(window.x().start(), end / _scale_x);
+
+ window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
window_modified = true;
}
}
@@ -192,7 +199,7 @@ bool AccessWindowRectangle::update_window_if_needed(Window &window) const
return window_modified;
}
-bool AccessWindowRectangle::update_padding_if_needed(const Window &window) const
+bool AccessWindowRectangle::update_padding_if_needed(const Window &window)
{
// Only update the padding if the tensor allows it
if(_info == nullptr || !_info->is_resizable())
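Besides the const removal, the window-shrinking paths above now clamp the recomputed bounds: start is capped at the window's end and end is floored at its start, so an extreme shrink yields an empty range rather than an inverted one. With made-up numbers:

    // Raw arithmetic says the new start is 10, but the window ends at 8.
    int start = 10;
    start     = std::min<int>(start, /* window.x().end() */ 8); // empty [8, 8), not inverted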
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index b13fb0e87c..13d87a0989 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,285 +33,433 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/utility.h"
#include <algorithm>
#include <arm_neon.h>
#include <cfloat>
+#include <functional>
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments_logits_1d_max(const ITensorInfo *input, const ITensorInfo *output)
+namespace arm_compute
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+template <typename T, int N>
+struct vec_n_type;
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- // Softmax across the x dimension
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, 1);
+#define DECLARE_NEON_VEC_TYPE(T, N, V) \
+ template <> \
+ struct vec_n_type<T, N> \
+ { \
+ using type = V; \
+ };
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- }
+DECLARE_NEON_VEC_TYPE(uint8_t, 16, uint8x16_t)
+DECLARE_NEON_VEC_TYPE(uint8_t, 8, uint8x8_t)
- return Status{};
-}
+DECLARE_NEON_VEC_TYPE(int8_t, 16, int8x16_t)
+DECLARE_NEON_VEC_TYPE(int8_t, 8, int8x8_t)
-std::pair<Status, Window> validate_and_configure_window_logits_1d_max(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- constexpr unsigned int num_elems_written_per_row = 1;
- const int input_width = input->valid_region().shape.x();
+DECLARE_NEON_VEC_TYPE(uint16_t, 8, uint16x8_t)
+DECLARE_NEON_VEC_TYPE(uint16_t, 4, uint16x4_t)
- unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- bool window_changed = false;
+DECLARE_NEON_VEC_TYPE(int16_t, 8, int16x8_t)
+DECLARE_NEON_VEC_TYPE(int16_t, 4, int16x4_t)
- if(output->total_size() != 0)
- {
- AccessWindowHorizontal output_access(output, 0, num_elems_written_per_row, 1.f / input_width);
- window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- else
- {
- window_changed = update_window_and_padding(win, input_access);
- }
+DECLARE_NEON_VEC_TYPE(int32_t, 4, int32x4_t)
+DECLARE_NEON_VEC_TYPE(int32_t, 2, int32x2_t)
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
+DECLARE_NEON_VEC_TYPE(uint32_t, 4, uint32x4_t)
+DECLARE_NEON_VEC_TYPE(uint32_t, 2, uint32x2_t)
-Status validate_arguments_logits_1d_shift_exp_sum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, max, sum, output);
- ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+DECLARE_NEON_VEC_TYPE(float16_t, 8, float16x8_t)
+DECLARE_NEON_VEC_TYPE(float16_t, 4, float16x4_t)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- }
+DECLARE_NEON_VEC_TYPE(float, 4, float32x4_t)
+DECLARE_NEON_VEC_TYPE(float, 2, float32x2_t)
- // Checks performed when sum is configured
- if(sum->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max, sum);
- }
+template <typename T, int N>
+using vec_n_t = typename vec_n_type<T, N>::type;
- return Status{};
-}
+template <typename T, int N>
+using vec_n_byte_t = vec_n_t < T, N / sizeof(T) >;
+
+template <typename T>
+using vec_16_byte_t = vec_n_byte_t<T, 16>;
+
+template <typename T>
+using vec_8_byte_t = vec_n_byte_t<T, 8>;
+
+template <typename T>
+using const_ptr_t = const T *;
-std::pair<Status, Window> validate_and_configure_window_logits_1d_shift_exp_sum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
+template <typename T>
+using ptr_t = T *;
+
+#define FORWARD_DECLARE_VGET_LANE_FOR_TYPE(TYPE) \
+ template <int lane> \
+ TYPE vget_lane(vec_8_byte_t<TYPE> vec); \
+ template <int lane> \
+ TYPE vget_lane(vec_16_byte_t<TYPE> vec);
+
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint8_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int8_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint16_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int16_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(uint32_t)
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(int32_t)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(float16_t)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+FORWARD_DECLARE_VGET_LANE_FOR_TYPE(float)
+template <int lane>
+float vget_lane(float32x4x4_t vec);
+
+template <typename V>
+using elem_type_t = decltype(vget_lane<0>(std::declval<V>()));
+
+template <typename V>
+constexpr size_t vec_size_of(const V &vec)
{
- unsigned int num_elems_processed_per_iteration = input->valid_region().shape.x();
+ return sizeof(vec) / sizeof(elem_type_t<V>);
+}
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal max_access(max, 0, 1);
- AccessWindowHorizontal sum_access(sum, 0, 1);
- bool window_changed = false;
+template <typename V>
+V vdup_n(elem_type_t<V> val);
+template <typename V>
+V vld(const_ptr_t<elem_type_t<V>> ptr);
+
+#define DECLARE_NEON_FUNCTIONS_FOR_TYPE(TYPE, TAG) \
+ template <> \
+ inline vec_8_byte_t<TYPE> vdup_n<vec_8_byte_t<TYPE>>(TYPE val) \
+ { \
+ return vdup_n_##TAG(val); \
+ } \
+ template <> \
+ inline vec_16_byte_t<TYPE> vdup_n<vec_16_byte_t<TYPE>>(TYPE val) \
+ { \
+ return vdupq_n_##TAG(val); \
+ } \
+ template <> \
+ inline vec_8_byte_t<TYPE> vld<vec_8_byte_t<TYPE>>(const_ptr_t<TYPE> ptr) \
+ { \
+ return vld1_##TAG(ptr); \
+ } \
+ template <> \
+ inline vec_16_byte_t<TYPE> vld<vec_16_byte_t<TYPE>>(const_ptr_t<TYPE> ptr) \
+ { \
+ return vld1q_##TAG(ptr); \
+ } \
+ inline void vst(ptr_t<TYPE> ptr, vec_8_byte_t<TYPE> vec) \
+ { \
+ vst1_##TAG(ptr, vec); \
+ } \
+ inline void vst(ptr_t<TYPE> ptr, vec_16_byte_t<TYPE> vec) \
+ { \
+ vst1q_##TAG(ptr, vec); \
+ } \
+ inline vec_16_byte_t<TYPE> vmax(vec_16_byte_t<TYPE> a, vec_16_byte_t<TYPE> b) \
+ { \
+ return vmaxq_##TAG(a, b); \
+ } \
+ inline vec_8_byte_t<TYPE> vpmax(vec_8_byte_t<TYPE> a, vec_8_byte_t<TYPE> b) \
+ { \
+ return vpmax_##TAG(a, b); \
+ } \
+ inline vec_8_byte_t<TYPE> vget_low(vec_16_byte_t<TYPE> vec) \
+ { \
+ return vget_low_##TAG(vec); \
+ } \
+ inline vec_8_byte_t<TYPE> vget_high(vec_16_byte_t<TYPE> vec) \
+ { \
+ return vget_high_##TAG(vec); \
+ } \
+ template <int lane> \
+ inline TYPE vget_lane(vec_8_byte_t<TYPE> vec) \
+ { \
+ static_assert(lane >= 0, "lane is out of bounds"); \
+ static_assert(lane < vec_size_of(vec), "lane is out of bounds"); \
+ return vget_lane_##TAG(vec, lane); \
+ } \
+ template <int lane> \
+ inline TYPE vget_lane(vec_16_byte_t<TYPE> vec) \
+ { \
+ static_assert(lane >= 0, "lane is out of bounds"); \
+ static_assert(lane < vec_size_of(vec), "lane is out of bounds"); \
+ return vgetq_lane_##TAG(vec, lane); \
+ }
- if(output->total_size() != 0)
- {
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
- output_access.set_valid_region(win, input->valid_region());
+template <typename T>
+T sqadd(T a, T b);
+template <typename T>
+T sqsub(T a, T b);
+template <typename T>
+T sqmul(T a, T b, int fixed_point_position);
+
+#define DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU) \
+ inline vec_8_byte_t<TYPET> vqsub(vec_8_byte_t<TYPET> a, vec_8_byte_t<TYPET> b) \
+ { \
+ return vqsub_##TAGT(a, b); \
+ } \
+ inline vec_8_byte_t<TYPEU> vqadd(vec_8_byte_t<TYPEU> a, vec_8_byte_t<TYPEU> b) \
+ { \
+ return vqadd_##TAGU(a, b); \
+ } \
+ inline vec_16_byte_t<TYPEU> vqadd(vec_16_byte_t<TYPEU> a, vec_16_byte_t<TYPEU> b) \
+ { \
+ return vqaddq_##TAGU(a, b); \
+ } \
+ inline vec_8_byte_t<TYPET> vqexp(vec_8_byte_t<TYPET> vec, int fixed_point_position) \
+ { \
+ return vqexp_q##TAGT(vec, fixed_point_position); \
+ } \
+ inline auto vmovl(vec_8_byte_t<TYPET> vec)->decltype(vmovl_##TAGT(vec)) \
+ { \
+ return vmovl_##TAGT(vec); \
+ } \
+ inline vec_16_byte_t<TYPET> vqrecip(vec_16_byte_t<TYPET> vec, int fixed_point_position) \
+ { \
+ return vqrecipq_q##TAGT(vec, fixed_point_position); \
+ } \
+ inline vec_16_byte_t<TYPET> vqmul(vec_16_byte_t<TYPET> a, vec_16_byte_t<TYPET> b, int fixed_point_position) \
+ { \
+ return vqmulq_q##TAGT(a, b, fixed_point_position); \
+ } \
+ template <> \
+ inline TYPEU sqadd<TYPEU>(TYPEU a, TYPEU b) \
+ { \
+ return sqadd_q##TAGU(a, b); \
+ } \
+ inline TYPET sqexp(TYPET val, int fixed_point_position) \
+ { \
+ return sqexp_q##TAGT(val, fixed_point_position); \
+ } \
+ template <> \
+ inline TYPET sqsub<TYPET>(TYPET a, TYPET b) \
+ { \
+ return sqsub_q##TAGT(a, b); \
+ } \
+ template <> \
+ inline TYPET sqmul<TYPET>(TYPET a, TYPET b, int fixed_point_position) \
+ { \
+ return sqmul_q##TAGT(a, b, fixed_point_position); \
}
- else
- {
- window_changed = update_window_and_padding(win, input_access, max_access, sum_access);
+
+#define DECLARE_NEON_FUNCTIONS_FOR_FLOAT(TYPE, TAG) \
+ inline vec_8_byte_t<TYPE> vadd(vec_8_byte_t<TYPE> a, vec_8_byte_t<TYPE> b) \
+ { \
+ return vadd_##TAG(a, b); \
+ } \
+ inline vec_16_byte_t<TYPE> vadd(vec_16_byte_t<TYPE> a, vec_16_byte_t<TYPE> b) \
+ { \
+ return vaddq_##TAG(a, b); \
+ } \
+ inline vec_16_byte_t<TYPE> vsub(vec_16_byte_t<TYPE> a, vec_16_byte_t<TYPE> b) \
+ { \
+ return vsubq_##TAG(a, b); \
+ } \
+ inline vec_16_byte_t<TYPE> vexp(vec_16_byte_t<TYPE> vec) \
+ { \
+ return vexpq_##TAG(vec); \
+ } \
+ inline vec_16_byte_t<TYPE> vmul_n(vec_16_byte_t<TYPE> vec, TYPE val) \
+ { \
+ return vmulq_n_##TAG(vec, val); \
}
- sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint8_t, u8)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(int8_t, s8)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint16_t, u16)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(int16_t, s16)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(uint32_t, u32)
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(int32_t, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(float16_t, f16)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+DECLARE_NEON_FUNCTIONS_FOR_TYPE(float, f32)
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
+DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int8_t, int16_t, s8, s16)
+DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int16_t, int32_t, s16, s32)
-Status validate_arguments_logits_1d_norm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float16_t, f16)
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float, f32)
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- }
+template <typename VO, typename VI>
+VO vcvt(VI vec);
- return Status{};
+template <>
+float32x4x4_t vcvt<float32x4x4_t>(uint8x16_t vec)
+{
+ const auto low = vmovl_u8(vget_low(vec));
+ const auto high = vmovl_u8(vget_high(vec));
+ float32x4x4_t res = { {
+ vcvtq_f32_u32(vmovl_u16(vget_low(low))),
+ vcvtq_f32_u32(vmovl_u16(vget_high(low))),
+ vcvtq_f32_u32(vmovl_u16(vget_low(high))),
+ vcvtq_f32_u32(vmovl_u16(vget_high(high)))
+ }
+ };
+ return res;
}
-std::pair<Status, Window> validate_and_configure_window_logits_1d_norm(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output)
+template <>
+uint8x16_t vcvt<uint8x16_t>(float32x4x4_t vec)
{
- // Configure kernel window
- unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ uint16x8x2_t resU16 = { {
+ vcombine_u16(vqmovn_u32(vcvtq_u32_f32(vec.val[0])),
+ vqmovn_u32(vcvtq_u32_f32(vec.val[1]))),
+ vcombine_u16(vqmovn_u32(vcvtq_u32_f32(vec.val[2])),
+ vqmovn_u32(vcvtq_u32_f32(vec.val[3])))
+ }
+ };
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowStatic sum_access(sum, 0, 0, 1, sum->dimension(1));
- bool window_changed = false;
+ uint8x16_t res = vcombine_u8(vqmovn_u16(resU16.val[0]), vqmovn_u16(resU16.val[1]));
+ return res;
+}
- if(output->total_size() != 0)
- {
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+float32x4x4_t vexp(float32x4x4_t vec)
+{
+ float32x4x4_t res = { {
+ vexpq_f32(vec.val[0]),
+ vexpq_f32(vec.val[1]),
+ vexpq_f32(vec.val[2]),
+ vexpq_f32(vec.val[3])
+ }
+ };
+ return res;
+}
- window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
+template <>
+float32x4x4_t vdup_n<float32x4x4_t>(float val)
+{
+ float32x4x4_t res = { {
+ vdupq_n_f32(val),
+ vdupq_n_f32(val),
+ vdupq_n_f32(val),
+ vdupq_n_f32(val)
+ }
+ };
+ return res;
+}
- output_access.set_valid_region(win, input->valid_region());
- }
- else
- {
- window_changed = update_window_and_padding(win, input_access, sum_access);
- }
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
+float32x4x4_t vmul_n(float32x4x4_t vec, float val)
+{
+ float32x4x4_t res = { {
+ vmulq_n_f32(vec.val[0], val),
+ vmulq_n_f32(vec.val[1], val),
+ vmulq_n_f32(vec.val[2], val),
+ vmulq_n_f32(vec.val[3], val)
+ }
+ };
+ return res;
}
-void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
+float32x4x4_t vadd(float32x4x4_t a, float32x4x4_t b)
{
- Window in_slice = window.first_slice_window_1D();
+ float32x4x4_t res = { {
+ vaddq_f32(a.val[0], b.val[0]),
+ vaddq_f32(a.val[1], b.val[1]),
+ vaddq_f32(a.val[2], b.val[2]),
+ vaddq_f32(a.val[3], b.val[3])
+ }
+ };
+ return res;
+}
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window max_slice = window_max.first_slice_window_1D();
+namespace
+{
+Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- do
+ // Validate output if configured
+ if(output.total_size() != 0)
{
- Iterator input(in, in_slice);
- Iterator output(out, max_slice);
-
- qint8x16_t vec_max = vdupq_n_s8(std::numeric_limits<qint8_t>::lowest());
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
- const qint8x16_t current_value = vld1q_qs8(in_ptr);
- vec_max = vmaxq_qs8(vec_max, current_value);
- },
- input);
-
- qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max));
- carry_max = vpmax_qs8(carry_max, carry_max);
- carry_max = vpmax_qs8(carry_max, carry_max);
- carry_max = vpmax_qs8(carry_max, carry_max);
-
- *(reinterpret_cast<qint8_t *>(output.ptr())) = vget_lane_s8(carry_max, 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
}
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+
+ return Status{};
}
-void logits_1d_max_qs16(const ITensor *in, ITensor *out, const Window &window)
+
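+// The max kernel consumes whole rows, rounded up to a vector multiple; the overrun beyond
+// input_width is what NELogits1DMaxKernel reports as its border size.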
+std::pair<Status, Window> validate_and_configure_window_logits_1d_max(ITensorInfo &input, ITensorInfo &output)
{
- Window in_slice = window.first_slice_window_1D();
+ // Softmax across the x dimension
+ const TensorShape output_shape = TensorShape(input.tensor_shape()).set(0, 1);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(output, output_shape, 1, input.data_type(), input.fixed_point_position(), input.quantization_info());
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window max_slice = window_max.first_slice_window_1D();
+ // Configure kernel window
+ const int input_width = input.valid_region().shape.x();
+ const int num_elems_processed_per_iteration = 16U / data_size_from_type(input.data_type());
+ const int num_elems_read_per_iteration = ceil_to_multiple(input_width, num_elems_processed_per_iteration);
- do
- {
- Iterator input(in, in_slice);
- Iterator output(out, max_slice);
+ const ValidRegion out_valid_region(ValidRegion(input.valid_region()).set(0, 0, 1));
+ output.set_valid_region(out_valid_region);
- qint16x8_t vec_max = vdupq_n_qs16(std::numeric_limits<qint16_t>::lowest());
+ Window win = calculate_max_window(output);
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
- const qint16x8_t current_value = vld1q_qs16(in_ptr);
- vec_max = vmaxq_qs16(vec_max, current_value);
- },
- input);
+ AccessWindowHorizontal input_access(&input, input.valid_region().anchor.x(), num_elems_read_per_iteration);
+ AccessWindowHorizontal output_access(&output, 0, 1);
- qint16x4_t carry_max = vpmax_qs16(vget_high_qs16(vec_max), vget_low_qs16(vec_max));
- carry_max = vpmax_qs16(carry_max, carry_max);
- carry_max = vpmax_qs16(carry_max, carry_max);
+ const bool window_changed = update_window_and_padding(win, input_access, output_access);
- *(reinterpret_cast<qint16_t *>(output.ptr())) = vget_lane_s16(carry_max, 0);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+ const Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void logits_1d_max_f16(const ITensor *in, ITensor *out, const Window &window)
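+// Horizontal max: vpmax folds the two halves into one vector of pairwise maxima, then
+// log2(N/2) self-vpmax steps leave the row maximum in lane 0.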
+template <typename V>
+auto reduce_max(V vec) -> elem_type_t<V>
{
- Window in_slice = window.first_slice_window_1D();
+ constexpr int N = vec_size_of(vec);
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window max_slice = window_max.first_slice_window_1D();
+ auto carry_max = vpmax(vget_high(vec), vget_low(vec));
- do
+ for(int k = N / 2; k > 1; k /= 2)
{
- Iterator input(in, in_slice);
- Iterator output(out, max_slice);
-
- float16x8_t vec_max = vdupq_n_f16(std::numeric_limits<float16_t>::lowest());
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const float16x8_t current_value = vld1q_f16(in_ptr);
- vec_max = vmaxq_f16(vec_max, current_value);
- },
- input);
-
- float16x4_t carry_max = vpmax_f16(vget_high_f16(vec_max), vget_low_f16(vec_max));
- carry_max = vpmax_f16(carry_max, carry_max);
- carry_max = vpmax_f16(carry_max, carry_max);
-
- *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(carry_max, 0);
+ carry_max = vpmax(carry_max, carry_max);
}
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+
+ return vget_lane<0>(carry_max);
}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
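+// Row-wise max. Vector reads may overrun input_width into the right border, which the
+// function level fills with BorderMode::REPLICATE, so padded lanes cannot raise the maximum.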
+template <typename T>
+void logits_1d_max(const ITensor &in, ITensor &out, const Window &window)
{
- Window in_slice = window.first_slice_window_1D();
+ const auto start_x = in.info()->valid_region().anchor.x();
+ const size_t input_width = in.info()->valid_region().shape.x();
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window max_slice = window_max.first_slice_window_1D();
+ Iterator input(&in, window);
+ Iterator output(&out, window);
- do
+ execute_window_loop(window, [&](const Coordinates &)
{
- Iterator input(in, in_slice);
- Iterator output(out, max_slice);
+ // Get pointers
+ const auto in_ptr = reinterpret_cast<const T *>(input.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(output.ptr());
- float32x4_t vec_max = vdupq_n_f32(-FLT_MAX);
+ // Init max value
+ auto vec_max = vdup_n<vec_16_byte_t<T>>(std::numeric_limits<T>::lowest());
- execute_window_loop(in_slice, [&](const Coordinates & id)
+ // Loop over input row
+ for(const T *it = in_ptr; it < (in_ptr + input_width); it += vec_size_of(vec_max))
{
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- const float32x4_t current_value = vld1q_f32(in_ptr);
- vec_max = vmaxq_f32(vec_max, current_value);
- },
- input);
-
- float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max));
- carry_max = vpmax_f32(carry_max, carry_max);
+ const auto current_value = vld<vec_16_byte_t<T>>(it);
+ vec_max = vmax(vec_max, current_value);
+ }
- *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_max, 0);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+ const T max_val = reduce_max(vec_max);
+ *out_ptr = max_val;
+ },
+ input, output);
}
} // namespace
@@ -328,54 +476,54 @@ BorderSize NELogits1DMaxKernel::border_size() const
void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Softmax across the x dimension
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(0, 1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(input->info(), output->info()));
-
- const int input_width = input->info()->valid_region().shape.x();
- unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*input->info(), *output->info()));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window_logits_1d_max(*input->info(), *output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
switch(input->info()->data_type())
{
+ case DataType::QASYMM8:
+ _func = &logits_1d_max<qasymm8_t>;
+ break;
case DataType::QS8:
- _func = &logits_1d_max_qs8;
+ _func = &logits_1d_max<qint8_t>;
break;
case DataType::QS16:
- _func = &logits_1d_max_qs16;
+ _func = &logits_1d_max<qint16_t>;
break;
- case DataType::F32:
- _func = &logits_1d_max_f32;
- break;
- case DataType::F16:
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- _func = &logits_1d_max_f16;
+ case DataType::F16:
+ _func = &logits_1d_max<float16_t>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
+ _func = &logits_1d_max<float>;
+ break;
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
}
- _input = input;
- _output = output;
- _border_size = BorderSize(0, num_elems_processed_per_iteration - (input_width % num_elems_processed_per_iteration), 0, 0);
+ _input = input;
+ _output = output;
+
+ const int input_width = input->info()->valid_region().shape.x();
+ const int num_elems_processed_per_iteration = 16U / data_size_from_type(input->info()->data_type());
+ const int num_elems_read_per_iteration = ceil_to_multiple(input_width, num_elems_processed_per_iteration);
+
+ _border_size = BorderSize(0, num_elems_read_per_iteration - input_width, 0, 0);
- // Configure kernel window
- auto win_config = validate_and_configure_window_logits_1d_max(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
Status NELogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*input, *output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(*input->clone(), *output->clone()).first);
return Status{};
}
@@ -387,297 +535,393 @@ void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input, _output, window);
+ (*_func)(*_input, *_output, window);
}
namespace
{
-void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
+Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max,
+ const ITensorInfo &output, const float beta, const ITensorInfo &tmp)
{
- ARM_COMPUTE_UNUSED(beta);
+ // Check input
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
- Window max_slice = window_max.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
+ // Check max
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &max);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max);
- constexpr int step = 8;
- const int long_steps = in->info()->valid_region().shape.x() / step;
- const int small_steps = in->info()->valid_region().shape.x() % step;
- const int fixed_point_position = in->info()->fixed_point_position();
+ // Check output if configured
+ if(output.total_size() != 0)
+ {
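+ // Softmax outputs lie in [0, 1), so QASYMM8 results always use the fixed (1/256, 0) quantization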
+ const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info();
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization);
+ }
+
+ // Check beta
+ ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input.data_type()));
- do
+ // Check tmp if configured
+ if(tmp.total_size() != 0)
{
- Iterator input(in, in_slice);
- Iterator exp(out, in_slice);
- Iterator _max(max, max_slice);
- Iterator _sum(sum, max_slice);
+ const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
+ ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &tmp);
+ // We could potentially reduce tmp memory if we could predict or make an assumption
+ // on the maximum number of threads that will run in parallel.
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp);
+ }
- // Get pointers
- auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
- auto exp_ptr = reinterpret_cast<qint8_t *>(exp.ptr());
+ return Status{};
+}
- // Init sum to zero
- qint16x8_t vec_sum_value = vdupq_n_qs16(0);
+std::pair<Status, Window> validate_and_configure_window_logits_softmax(ITensorInfo &input, ITensorInfo &max,
+ ITensorInfo &output, ITensorInfo &tmp)
+{
+ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
- // Get max value
- const auto max_ptr = reinterpret_cast<const qint8_t *>(_max.ptr());
- const qint8x8_t vec_max = vdup_n_qs8(*max_ptr);
+ // Output auto initialization if not yet initialized
+ const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info();
+ auto_init_if_empty(output, TensorInfo(input).set_quantization_info(output_quantization).reset_padding());
- // Run neon loop
- for(int i = 0; i < long_steps; ++i)
- {
- qint8x8_t vec_elements = vld1_qs8(in_ptr);
- vec_elements = vqsub_qs8(vec_elements, vec_max);
- vec_elements = vqexp_qs8(vec_elements, fixed_point_position);
+ // Tmp auto initialization if not yet initialized
+ const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
+ auto_init_if_empty(tmp, TensorInfo(input).set_data_type(tmp_data_type).reset_padding());
- vst1_qs8(exp_ptr, vec_elements);
- vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements));
+ const int input_width = input.valid_region().shape.x();
- in_ptr += step;
- exp_ptr += step;
- }
- // Reduce sum
- const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value));
- const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1));
- const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3));
- qint16_t sum = sqadd_qs16(sum0, sum1);
-
- // Run remaining elements
- for(int i = 0; i < small_steps; ++i)
- {
- qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position);
- exp_ptr[i] = element;
- sum = sqadd_qs16(sum, element);
- }
+ Window win = calculate_max_window(max);
- *(reinterpret_cast<qint8_t *>(_sum.ptr())) = sqmovn_qs16(sum);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
-}
-void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
-{
- ARM_COMPUTE_UNUSED(beta);
+ AccessWindowHorizontal input_access(&input, input.valid_region().anchor.x(), input_width);
+ AccessWindowHorizontal max_access(&max, 0, 1);
+ AccessWindowHorizontal output_access(&output, input.valid_region().anchor.x(), input_width);
+ AccessWindowHorizontal tmp_access(&tmp, input.valid_region().anchor.x(), input_width);
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ const bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, tmp_access);
- Window max_slice = window_max.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
+ output.set_valid_region(input.valid_region());
- constexpr int step = 4;
- const int long_steps = in->info()->valid_region().shape.x() / step;
- const int small_steps = in->info()->valid_region().shape.x() % step;
- const int fixed_point_position = in->info()->fixed_point_position();
+ const Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
- do
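+// Compile-time binary-tree reduction over lanes [S, E]: split the range, reduce each half
+// recursively and combine with add_fn; the single-lane base case extracts that lane.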
+template <typename T, int N, int S, int E>
+struct reduce_add_impl
+{
+ template <typename F>
+ static T reduce(F add_fn, vec_n_t<T, N> vec)
+ {
+ constexpr int H = (S + E + 1) / 2;
+ const auto reduced_high = reduce_add_impl < T, N, S, H - 1 >::reduce(add_fn, vec);
+ const auto reduced_low = reduce_add_impl<T, N, H, E>::reduce(add_fn, vec);
+ return add_fn(reduced_high, reduced_low);
+ }
+};
+template <typename T, int N, int I>
+struct reduce_add_impl<T, N, I, I>
+{
+ template <typename F>
+ static T reduce(F /*add_fn*/, vec_n_t<T, N> vec)
{
- Iterator input(in, in_slice);
- Iterator exp(out, in_slice);
- Iterator _max(max, max_slice);
- Iterator _sum(sum, max_slice);
+ return vget_lane<I>(vec);
+ }
+};
+template <typename V, typename F>
+elem_type_t<V> reduce_add(F add_fn, V vec)
+{
+ constexpr int N = vec_size_of(vec);
+ return reduce_add_impl < elem_type_t<V>, N, 0, N - 1 >::reduce(add_fn, vec);
+}
- // Get pointers
- auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
- auto exp_ptr = reinterpret_cast<qint16_t *>(exp.ptr());
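+// QASYMM8 path: exponentials are computed and summed in f32, staged in the per-thread tmp
+// buffer, then normalized back to u8. The quantization offset cancels in (max - x), so only
+// the scale folds into scale_beta.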
+void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta, const Window &window)
+{
+ const int start_x = in.info()->valid_region().anchor.x();
+ const int input_width = in.info()->valid_region().shape.x();
- // Init sum to zero
- qint32x4_t vec_sum_value = vdupq_n_qs32(0);
+ const float scale_beta = -beta * in.info()->quantization_info().scale;
- // Get max value
- const auto max_ptr = reinterpret_cast<const qint16_t *>(_max.ptr());
- const qint16x4_t vec_max = vdup_n_qs16(*max_ptr);
+ Iterator in_it(&in, window);
+ Iterator max_it(&max, window);
+ Iterator out_it(&out, window);
- // Run neon loop
- for(int i = 0; i < long_steps; ++i)
- {
- qint16x4_t vec_elements = vld1_qs16(in_ptr);
- vec_elements = vqsub_qs16(vec_elements, vec_max);
- vec_elements = vqexp_qs16(vec_elements, fixed_point_position);
+ execute_window_loop(window, [&](const Coordinates &)
+ {
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<qasymm8_t *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<float *>(tmp);
- vst1_qs16(exp_ptr, vec_elements);
- vec_sum_value = vqaddq_qs32(vec_sum_value, vmovl_s16(vec_elements));
+ float sum_inversed;
- in_ptr += step;
- exp_ptr += step;
+ /* Compute exponentials and sum */
+ {
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const qasymm8_t *>(max_it.ptr());
+ const auto vec_max = vdup_n<vec_16_byte_t<qasymm8_t>>(max_val);
+
+ /* Init sum to zero */
+ auto vec_sum = vdup_n<float32x4x4_t>(0.f);
+
+ /* Loop over row and compute exponentials and sum */
+ int i = 0;
+ constexpr int vec_size = vec_size_of(vec_max);
+ for(; i <= (input_width - vec_size); i += vec_size)
+ {
+ auto vec_elements = vld<vec_16_byte_t<qasymm8_t>>(in_ptr + i);
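+ /* max - x is non-negative in u8; the minus sign folded into scale_beta gives exp(beta * scale * (x - max)) */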
+ vec_elements = vsubq_u8(vec_max, vec_elements);
+
+ auto vec_elements_flt = vcvt<float32x4x4_t>(vec_elements);
+ vec_elements_flt = vexp(vmul_n(vec_elements_flt, scale_beta));
+
+ vec_sum = vadd(vec_sum, vec_elements_flt);
+
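+ /* vst4q_f32 stores the four quads interleaved; the matching vld4q_f32 in the normalize */
+ /* pass undoes it, and the ops in between are lane-wise, so element order round-trips */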
+ vst4q_f32(tmp_ptr + i, vec_elements_flt);
+ }
+ /* Reduce sum */
+ const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]),
+ vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+ const auto sum_8_byte = vadd_f32(vget_low(sum_16_byte), vget_high(sum_16_byte));
+ float sum = reduce_add(std::plus<float>(), sum_8_byte);
+
+ /* Run remaining elements */
+ for(; i < input_width; ++i)
+ {
+ const float element = std::exp((max_val - in_ptr[i]) * scale_beta);
+ sum += element;
+ tmp_ptr[i] = element;
+ }
+
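+ /* Fold the fixed output quantization (scale 1/256, offset 0) into the normalization factor */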
+ sum_inversed = 256.f / sum;
}
- // Reduce sum
- qint32x2_t carry_addition = vqadd_qs32(vget_high_s32(vec_sum_value), vget_low_s32(vec_sum_value));
- qint32_t sum = vget_lane_s32(carry_addition, 0) + vget_lane_s32(carry_addition, 1);
- // Run remaining elements
- for(int i = 0; i < small_steps; ++i)
+ /* Normalize exponentials */
{
- qint16_t element = sqexp_qs16(sqsub_qs16(in_ptr[i], *max_ptr), fixed_point_position);
- exp_ptr[i] = element;
- sum = sqadd_qs32(sum, element);
+ /* Loop over row and compute softmax */
+ int i = 0;
+ {
+ constexpr int vec_size = 16;
+ for(; i <= (input_width - vec_size); i += vec_size)
+ {
+ float32x4x4_t vec_in = vld4q_f32(tmp_ptr + i);
+ auto normalized_value = vcvt<vec_16_byte_t<qasymm8_t>>(vmul_n(vec_in, sum_inversed));
+ vst(out_ptr + i, normalized_value);
+ }
+ }
+ /* Run remaining elements */
+ for(; i < input_width; ++i)
+ {
+ out_ptr[i] = utility::saturate_cast<qasymm8_t>(tmp_ptr[i] * sum_inversed);
+ }
}
-
- *(reinterpret_cast<qint16_t *>(_sum.ptr())) = sqmovn_qs32(sum);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+ },
+ in_it, max_it, out_it);
}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void logits_1d_shift_exp_sum_f16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
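+// Fixed-point path: T is the element type (qint8_t or qint16_t) and U the widened accumulator
+// type (qint16_t or qint32_t) used to sum the exponentials with saturating arithmetic.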
+template <typename T, typename U>
+void logits_1d_softmax_fixed_point(const ITensor &in, const ITensor &max, void *const tmp,
+ ITensor &out, const float /*beta*/, const Window &window)
{
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ const int start_x = in.info()->valid_region().anchor.x();
+ const int input_width = in.info()->valid_region().shape.x();
- Window max_slice = window_max.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
+ const int fixed_point_position = in.info()->fixed_point_position();
- constexpr int step = 8;
- const int long_steps = in->info()->valid_region().shape.x() / step;
- const int small_steps = in->info()->valid_region().shape.x() % step;
+ Iterator in_it(&in, window);
+ Iterator max_it(&max, window);
+ Iterator out_it(&out, window);
- do
+ execute_window_loop(window, [&](const Coordinates &)
{
- Iterator input(in, in_slice);
- Iterator exp(out, in_slice);
- Iterator _max(max, max_slice);
- Iterator _sum(sum, max_slice);
-
- // Get pointers
- auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- auto exp_ptr = reinterpret_cast<float16_t *>(exp.ptr());
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<T *>(tmp);
- // Init sum to zero
- float16x8_t vec_sum_value = vdupq_n_f16(0);
+ vec_16_byte_t<T> vec_sum_inversed;
- // Get max value
- const auto max_ptr = reinterpret_cast<const float16_t *>(_max.ptr());
- const float16x8_t vec_max = vdupq_n_f16(*max_ptr);
-
- // Run neon loop
- for(int i = 0; i < long_steps; ++i)
+ /* Compute exponentials and sum */
{
- float16x8_t vec_elements = vld1q_f16(in_ptr);
- vec_elements = vsubq_f16(vec_elements, vec_max);
- vec_elements = vmulq_n_f16(vec_elements, beta);
- vec_elements = vexpq_f16(vec_elements);
-
- vst1q_f16(exp_ptr, vec_elements);
- vec_sum_value = vaddq_f16(vec_sum_value, vec_elements);
-
- in_ptr += step;
- exp_ptr += step;
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = vdup_n<vec_8_byte_t<T>>(max_val);
+
+ /* Init sum to zero */
+ auto vec_sum = vdup_n<vec_16_byte_t<U>>(0);
+
+ /* Loop over row and compute exponentials and sum */
+ int i = 0;
+ constexpr int vec_size = vec_size_of(vec_sum);
+ for(; i <= (input_width - vec_size); i += vec_size)
+ {
+ auto vec_elements = vld<vec_8_byte_t<T>>(in_ptr + i);
+ vec_elements = vqsub(vec_elements, vec_max);
+ vec_elements = vqexp(vec_elements, fixed_point_position);
+ vec_sum = vqadd(vec_sum, vmovl(vec_elements));
+ vst(tmp_ptr + i, vec_elements);
+ }
+ /* Reduce sum */
+ const vec_8_byte_t<U> sum_8_byte = vqadd(vget_high(vec_sum), vget_low(vec_sum));
+ U sum = reduce_add(sqadd<U>, sum_8_byte);
+
+ /* Run remaining elements */
+ for(; i < input_width; ++i)
+ {
+ T element = sqexp(sqsub(in_ptr[i], max_val), fixed_point_position);
+ sum = sqadd<U>(sum, element);
+ tmp_ptr[i] = element;
+ }
+
+ const auto qsum = utility::saturate_cast<T>(sum);
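+ /* One fixed-point reciprocal normalizes the whole row; its precision is what limits the */
+ /* usable fixed point position (the QS8 tests stop at position 5) */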
+ vec_sum_inversed = vqrecip(vdup_n<vec_16_byte_t<T>>(qsum), fixed_point_position);
}
- // Reduce sum
- const float16x4_t sum_red = vadd_f16(vget_low_f16(vec_sum_value), vget_high_f16(vec_sum_value));
- const float16x4_t carry_addition = vpadd_f16(sum_red, sum_red);
- float16_t sum = vget_lane_f16(carry_addition, 0) + vget_lane_f16(carry_addition, 1);
- // Run remaining elements
- for(int i = 0; i < small_steps; ++i)
+ /* Normalize exponentials */
{
- const float16_t element = std::exp(static_cast<float>(in_ptr[i] - *max_ptr) * beta);
- exp_ptr[i] = element;
- sum += element;
+ /* Loop over row and compute softmax */
+ int i = 0;
+ constexpr int vec_size = vec_size_of(vec_sum_inversed);
+ for(; i <= (input_width - vec_size); i += vec_size)
+ {
+ const auto vec_in = vld<vec_16_byte_t<T>>(tmp_ptr + i);
+ const vec_16_byte_t<T> normalized_value = vqmul(vec_in, vec_sum_inversed, fixed_point_position);
+ vst(out_ptr + i, normalized_value);
+ }
+
+ const T sum_inversed = vget_lane<0>(vec_sum_inversed);
+
+ /* Run remaining elements */
+ for(; i < input_width; ++i)
+ {
+ out_ptr[i] = sqmul(tmp_ptr[i], sum_inversed, fixed_point_position);
+ }
}
- *(reinterpret_cast<float16_t *>(_sum.ptr())) = sum;
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+ },
+ in_it, max_it, out_it);
}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
+template <typename T>
+void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp,
+ ITensor &out, const float beta, const Window &window)
{
- Window window_max(window);
- window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
+ const int start_x = in.info()->valid_region().anchor.x();
+ const int input_width = in.info()->valid_region().shape.x();
- Window max_slice = window_max.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
+ Iterator in_it(&in, window);
+ Iterator max_it(&max, window);
+ Iterator out_it(&out, window);
- constexpr int step = 4;
- const int long_steps = in->info()->valid_region().shape.x() / step;
- const int small_steps = in->info()->valid_region().shape.x() % step;
-
- do
+ execute_window_loop(window, [&](const Coordinates &)
{
- Iterator input(in, in_slice);
- Iterator exp(out, in_slice);
- Iterator _max(max, max_slice);
- Iterator _sum(sum, max_slice);
-
- // Get pointers
- auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- auto exp_ptr = reinterpret_cast<float *>(exp.ptr());
-
- // Init sum to zero
- float32x4_t vec_sum_value = vdupq_n_f32(0.0f);
+ /* Get pointers */
+ const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+ const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+ const auto tmp_ptr = reinterpret_cast<T *>(tmp);
- // Get max value
- const auto max_ptr = reinterpret_cast<const float *>(_max.ptr());
- const float32x4_t vec_max = vdupq_n_f32(*max_ptr);
+ T sum_inversed;
- // Run neon loop
- for(int i = 0; i < long_steps; ++i)
+ /* Compute exponentials and sum */
{
- float32x4_t vec_elements = vld1q_f32(in_ptr);
- vec_elements = vsubq_f32(vec_elements, vec_max);
- vec_elements = vmulq_n_f32(vec_elements, beta);
- vec_elements = vexpq_f32(vec_elements);
-
- vst1q_f32(exp_ptr, vec_elements);
- vec_sum_value = vaddq_f32(vec_elements, vec_sum_value);
-
- in_ptr += step;
- exp_ptr += step;
+ /* Get max value */
+ const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+ const auto vec_max = vdup_n<vec_16_byte_t<T>>(max_val);
+
+ /* Init sum to zero */
+ auto vec_sum = vdup_n<vec_16_byte_t<T>>(0);
+
+ /* Loop over row and compute exponentials and sum */
+ int i = 0;
+ constexpr int vec_size = vec_size_of(vec_sum);
+ for(; i <= (input_width - vec_size); i += vec_size)
+ {
+ auto vec_elements = vld<vec_16_byte_t<T>>(in_ptr + i);
+ vec_elements = vsub(vec_elements, vec_max);
+ vec_elements = vexp(vmul_n(vec_elements, beta));
+ vec_sum = vadd(vec_sum, vec_elements);
+ vst(tmp_ptr + i, vec_elements);
+ }
+ /* Reduce sum */
+ const auto sum_8_byte = vadd(vget_high(vec_sum), vget_low(vec_sum));
+ T sum = reduce_add([](T a, T b) -> T { return a + b; }, sum_8_byte);
+
+ /* Run remaining elements */
+ for(; i < input_width; ++i)
+ {
+ T element = std::exp((in_ptr[i] - max_val) * beta);
+ sum += element;
+ tmp_ptr[i] = element;
+ }
+
+ sum_inversed = T(1) / sum;
}
- // Reduce sum
- float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
- carry_addition = vpadd_f32(carry_addition, carry_addition);
- float sum = vget_lane_f32(carry_addition, 0);
-
- // Run remaining elements
- for(int i = 0; i < small_steps; ++i)
+ /* Normalize exponentials */
{
- float element = std::exp((in_ptr[i] - *max_ptr) * beta);
- exp_ptr[i] = element;
- sum += element;
+ /* Loop over row and compute softmax */
+ int i = 0;
+ {
+ constexpr int vec_size = vec_size_of(vec_16_byte_t<T> {});
+ for(; i <= (input_width - vec_size); i += vec_size)
+ {
+ auto vec_in = vld<vec_16_byte_t<T>>(tmp_ptr + i);
+ vec_16_byte_t<T> normalized_value = vmul_n(vec_in, sum_inversed);
+ vst(out_ptr + i, normalized_value);
+ }
+ }
+ /* Run remaining elements */
+ for(; i < input_width; ++i)
+ {
+ out_ptr[i] = tmp_ptr[i] * sum_inversed;
+ }
}
-
- *(reinterpret_cast<float *>(_sum.ptr())) = sum;
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
+ },
+ in_it, max_it, out_it);
}
-} //namespace
+} // namespace
-NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
- : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr), _beta(1.0f)
+NELogits1DSoftmaxKernel::NELogits1DSoftmaxKernel()
+ : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _beta(1.0f), _tmp(nullptr)
{
}
-void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta)
+void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), max->info(), output->info(), tmp->info());
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info(), beta));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*input->info(), *max->info(), *output->info(), beta, *tmp->info()));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window_logits_softmax(*input->info(), *max->info(), *output->info(), *tmp->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
switch(input->info()->data_type())
{
+ case DataType::QASYMM8:
+ _func = &logits_1d_softmax_qasymm8;
+ break;
case DataType::QS8:
- _func = &logits_1d_shift_exp_sum_qs8;
+ _func = &logits_1d_softmax_fixed_point<qint8_t, qint16_t>;
break;
case DataType::QS16:
- _func = &logits_1d_shift_exp_sum_qs16;
- break;
- case DataType::F32:
- _func = &logits_1d_shift_exp_sum_f32;
+ _func = &logits_1d_softmax_fixed_point<qint16_t, qint32_t>;
break;
- case DataType::F16:
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- _func = &logits_1d_shift_exp_sum_f16;
+ case DataType::F16:
+ _func = &logits_1d_softmax_float<float16_t>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
+ _func = &logits_1d_softmax_float<float>;
+ break;
default:
ARM_COMPUTE_ERROR("Unsupported data type.");
break;
@@ -686,224 +930,37 @@ void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor
_input = input;
_max = max;
_output = output;
- _sum = sum;
_beta = beta;
+ _tmp = tmp;
- // Configure kernel window
- auto win_config = validate_and_configure_window_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NELogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta)
+Status NELogits1DSoftmaxKernel::validate(const ITensorInfo *input, const ITensorInfo *max,
+ const ITensorInfo *output, const float beta, const ITensorInfo *tmp)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_shift_exp_sum(input, max, output, sum, beta));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_shift_exp_sum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, max, output, tmp);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*input, *max, *output, beta, *tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_softmax(*input->clone(), *max->clone(), *output->clone(), *tmp->clone()).first);
return Status{};
}
-void NELogits1DShiftExpSumKernel::run(const Window &window, const ThreadInfo &info)
+void NELogits1DSoftmaxKernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input, _max, _output, _sum, window, _beta);
-}
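+ // Give each thread its own row-sized slice of the tmp tensor, offset by thread id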
+ const unsigned int num_elems_processed_per_iteration = _input->info()->valid_region().shape.x();
+ const unsigned int tmp_size_for_thread = _tmp->info()->element_size() * num_elems_processed_per_iteration;
-namespace
-{
-void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
- Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window sum_slice = window_sum.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
+ ARM_COMPUTE_ERROR_ON(_tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
- const int fixed_point_position = in->info()->fixed_point_position();
+ void *tmp_for_thread = _tmp->buffer() + (info.thread_id * tmp_size_for_thread);
- do
- {
- Iterator input(in, in_slice);
- Iterator _sum(sum, sum_slice);
- Iterator output(out, in_slice);
-
- const int8_t sum_value = *reinterpret_cast<const qint8_t *>(_sum.ptr());
- const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint8_t *>(input.ptr());
- const auto out_ptr = reinterpret_cast<qint8_t *>(output.ptr());
-
- const qint8x16_t vec_in = vld1q_qs8(in_ptr);
- const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position);
-
- vst1q_qs8(out_ptr, normalized_value);
- },
- input, output);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+ (*_func)(*_input, *_max, tmp_for_thread, *_output, _beta, window);
}
-void logits_1d_norm_qs16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
- Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window sum_slice = window_sum.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
- const int fixed_point_position = in->info()->fixed_point_position();
-
- do
- {
- Iterator input(in, in_slice);
- Iterator _sum(sum, sum_slice);
- Iterator output(out, in_slice);
-
- const int16_t sum_value = *reinterpret_cast<const qint16_t *>(_sum.ptr());
- const qint16x8_t vec_sum_inversed = vqrecipq_qs16(vdupq_n_qs16(sum_value), fixed_point_position);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint16_t *>(input.ptr());
- const auto out_ptr = reinterpret_cast<qint16_t *>(output.ptr());
-
- const qint16x8_t vec_in = vld1q_qs16(in_ptr);
- const qint16x8_t normalized_value = vqmulq_qs16(vec_in, vec_sum_inversed, fixed_point_position);
-
- vst1q_qs16(out_ptr, normalized_value);
- },
- input, output);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void logits_1d_norm_f16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
- Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window sum_slice = window_sum.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
-
- do
- {
- Iterator input(in, in_slice);
- Iterator _sum(sum, sum_slice);
- Iterator output(out, in_slice);
-
- const float16_t sum_value = *reinterpret_cast<const qint16_t *>(_sum.ptr());
- const float16x8_t vec_sum_inversed = vdupq_n_f16(1.0f / sum_value);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto out_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- const float16x8_t vec_in = vld1q_f16(in_ptr);
- const float16x8_t normalized_value = vmulq_f16(vec_in, vec_sum_inversed);
-
- vst1q_f16(out_ptr, normalized_value);
- },
- input, output);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
-{
- Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
- Window sum_slice = window_sum.first_slice_window_1D();
- Window in_slice = window.first_slice_window_1D();
-
- do
- {
- Iterator input(in, in_slice);
- Iterator _sum(sum, sum_slice);
- Iterator output(out, in_slice);
-
- const float sum_value = *reinterpret_cast<const float *>(_sum.ptr());
- const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value);
-
- execute_window_loop(in_slice, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto out_ptr = reinterpret_cast<float *>(output.ptr());
-
- const float32x4_t vec_in = vld1q_f32(in_ptr);
- const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed);
-
- vst1q_f32(out_ptr, normalized_value);
- },
- input, output);
- }
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
-}
-} // namespace
-
-NELogits1DNormKernel::NELogits1DNormKernel()
- : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr)
-{
-}
-
-void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_norm(input->info(), sum->info(), output->info()));
-
- _input = input;
- _sum = sum;
- _output = output;
-
- switch(input->info()->data_type())
- {
- case DataType::QS8:
- _func = &logits_1d_norm_qs8;
- break;
- case DataType::QS16:
- _func = &logits_1d_norm_qs16;
- break;
- case DataType::F32:
- _func = &logits_1d_norm_f32;
- break;
- case DataType::F16:
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- _func = &logits_1d_norm_f16;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- break;
- }
-
- // Configure kernel window
- auto win_config = validate_and_configure_window_logits_1d_norm(input->info(), sum->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NELogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_norm(input, sum, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_norm(input->clone().get(), sum->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void NELogits1DNormKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(_input, _sum, _output, window);
-}
+} // namespace arm_compute
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 0150a95cc6..836c3794c2 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -92,7 +92,7 @@ std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const
return clone_obj;
}
-ITensorInfo &SubTensorInfo::set_tensor_shape(TensorShape shape)
+ITensorInfo &SubTensorInfo::set_tensor_shape(const TensorShape &shape)
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 24988e2217..bd0c85f3d4 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -348,7 +348,7 @@ ITensorInfo &TensorInfo::set_format(Format format)
return *this;
}
-ITensorInfo &TensorInfo::set_tensor_shape(TensorShape shape)
+ITensorInfo &TensorInfo::set_tensor_shape(const TensorShape &shape)
{
_tensor_shape = shape;
_offset_first_element_in_bytes = 0;
@@ -378,7 +378,7 @@ ITensorInfo &TensorInfo::set_fixed_point_position(int fixed_point_position)
return *this;
}
-ITensorInfo &TensorInfo::set_quantization_info(QuantizationInfo quantization_info)
+ITensorInfo &TensorInfo::set_quantization_info(const QuantizationInfo &quantization_info)
{
_quantization_info = quantization_info;
return *this;
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 8e6773c5b1..4fb83007c5 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,7 @@
using namespace arm_compute;
NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _fill_border_kernel(), _max(), _sum(), _tmp()
+ : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp()
{
}
@@ -40,31 +40,22 @@ void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- // Create intermediate tensors shapes
- TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
- _tmp.allocator()->init(tensor_info_tmp);
+ // Configure Kernels
+ _max_kernel.configure(input, &_max);
+ _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
+ _softmax_kernel.configure(input, &_max, output, beta, &_tmp);
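+ // Configuring the kernels auto-initializes the _max and _tmp tensor infos used below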
- TensorShape shape = input->info()->tensor_shape();
- shape.set(0, 1);
- TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
- _max.allocator()->init(tensor_info_max_sum);
- _sum.allocator()->init(tensor_info_max_sum);
+ // Init intermediate tensors
+ _max.allocator()->init(*_max.info());
+ _tmp.allocator()->init(*_tmp.info());
// Manage intermediate buffers
- _memory_group.manage(&_tmp);
_memory_group.manage(&_max);
- _memory_group.manage(&_sum);
-
- // Configure Kernels
- _max_kernel.configure(input, &_max);
- _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
- _norm_kernel.configure(&_tmp, &_sum, output);
- _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
+ _memory_group.manage(&_tmp);
// Allocate intermediate tensors
- _tmp.allocator()->allocate();
_max.allocator()->allocate();
- _sum.allocator()->allocate();
+ _tmp.allocator()->allocate();
}
Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
@@ -72,14 +63,12 @@ Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- TensorShape max_sum_shape = input->tensor_shape();
- max_sum_shape.set(0, 1);
-
- TensorInfo tensor_info_max_sum(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(max_sum_shape));
+ const TensorShape max_shape = TensorShape(input->tensor_shape()).set(0, 1);
+ const TensorInfo tensor_info_max_sum = TensorInfo(*input).set_tensor_shape(max_shape).reset_padding();
+ const TensorInfo dont_care;
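+ // An empty TensorInfo has total_size() == 0, so the kernel's validate skips the tmp checks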
ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DShiftExpSumKernel::validate(input, &tensor_info_max_sum, input, &tensor_info_max_sum, beta));
- ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DNormKernel::validate(input, &tensor_info_max_sum, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(input, &tensor_info_max_sum, output, beta, &dont_care));
return Status{};
}
@@ -90,8 +79,7 @@ void NESoftmaxLayer::run()
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
NEScheduler::get().schedule(&_max_kernel, Window::DimY);
- NEScheduler::get().schedule(&_shift_exp_sum_kernel, Window::DimY);
- NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
_memory_group.release();
}
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 0b688dfd1b..9b9f1fdce2 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,6 +50,9 @@ constexpr AbsoluteTolerance<float> tolerance_f16(0.0001f);
/** Tolerance for fixed point operations */
constexpr AbsoluteTolerance<int16_t> tolerance_fixed_point(2);
+/** Tolerance for quantized operations */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
+
/** CNN data types */
const auto CNNDataTypes = framework::dataset::make("DataType",
{
@@ -90,7 +93,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datase
const int step = 16 / data_size_from_type(data_type);
const PaddingSize padding = PaddingCalculator(shape.x(), step).required_padding();
validate(src.info()->padding(), padding);
- validate(dst.info()->padding(), padding);
+ validate(dst.info()->padding(), PaddingSize());
}
// *INDENT-OFF*
@@ -159,7 +162,7 @@ TEST_SUITE_END()
template <typename T>
using NESoftmaxLayerFixedPointFixture = SoftmaxValidationFixedPointFixture<Tensor, Accessor, NESoftmaxLayer, T>;
-TEST_SUITE(Quantized)
+TEST_SUITE(FixedPoint)
TEST_SUITE(QS8)
// Testing for fixed point position [1,6) as reciprocal limits the maximum fixed point position to 5
FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixedPointFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType",
@@ -199,6 +202,30 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixedPointFixture<int16_t>, frame
TEST_SUITE_END()
TEST_SUITE_END()
+template <typename T>
+using NESoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<Tensor, Accessor, NESoftmaxLayer, T>;
+
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SoftmaxLayerSmallShapes(),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
TEST_SUITE_END()
TEST_SUITE_END()
} // namespace validation