From d6afedc775220f17317f1835a4d18b72a54525de Mon Sep 17 00:00:00 2001
From: Chunosov
Date: Mon, 6 Nov 2017 22:09:45 +0700
Subject: COMPMID-661: softmax-fp32 optimisation (#14)

Change-Id: I2007af1ed9dcf68065cf412aa50f73a2025b31a6
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94605
Reviewed-by: Gian Marco Iodice
Tested-by: Kaizen
---
 arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h |  58 ++-
 arm_compute/runtime/CL/functions/CLSoftmaxLayer.h  |  18 +-
 src/core/CL/CLKernelLibrary.cpp                    |   2 +
 src/core/CL/cl_kernels/fixed_point.h               |   5 +
 src/core/CL/cl_kernels/helpers.h                   |   3 +
 src/core/CL/cl_kernels/softmax_layer.cl            | 487 +++++++++++++++++++++
 src/core/CL/kernels/CLSoftmaxLayerKernel.cpp       | 131 ++++++
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |  36 +-
 tests/datasets/ShapeDatasets.h                     |  34 ++
 tests/validation/CL/SoftmaxLayer.cpp               |  33 +-
 10 files changed, 778 insertions(+), 29 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
index 1e079cbb06..675c462c95 100644
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -26,6 +26,8 @@
 
 #include "arm_compute/core/CL/ICLSimple3DKernel.h"
 
+#include <tuple>
+
 namespace arm_compute
 {
 class ICLTensor;
@@ -42,7 +44,7 @@ public:
     void configure(const ICLTensor *input, ICLTensor *output);
 };
 
-/** Interface for shifting the logits values around the max value and exponentiating the result */
+/** Interface for shifting, exponentiating and summing the logits */
 class CLLogits1DShiftExpSumKernel : public ICLKernel
 {
 public:
@@ -60,9 +62,9 @@ public:
      *
      * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
      * @param[in]  max    Max values tensor. Data types supported: same as @p input
-     * @param[in]  beta   A scaling factor for the exponent.
      * @param[out] output Destination tensor. Data types supported: same as @p input
      * @param[out] sum    Sum of 1D logits tensor. Data types supported: same as @p input
+     * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
      */
     void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
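For orientation, here is a scalar C++ sketch of what the shift/exp/sum kernels above compute for a single row, including the optional beta scale. The function name and signature are illustrative only, not part of the library's API.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference beta-softmax for one row. Subtracting the row maximum before
// exponentiating keeps exp() from overflowing for large logits; the result
// is mathematically identical to softmax(x * beta).
std::vector<float> softmax_row(const std::vector<float> &x, float beta = 1.0f)
{
    const float max_val = *std::max_element(x.begin(), x.end()); // the "max" stage
    std::vector<float> y(x.size());
    float sum = 0.0f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp((x[i] - max_val) * beta); // shift, scale, exponentiate
        sum += y[i];                              // running row sum
    }
    for(float &v : y)
    {
        v /= sum; // normalisation, the job of CLLogits1DNormKernel
    }
    return y;
}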
@@ -76,6 +78,58 @@ private:
     ICLTensor       *_sum;
 };
 
+/** Interface for max, shifting, exponentiating and summing the logits */
+class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
+{
+public:
+    using ParallelReductionInfo = std::tuple<bool, unsigned int>;
+
+public:
+    /** Default constructor */
+    CLLogits1DMaxShiftExpSumKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]     input  Source tensor. Data types supported: QS8/QS16/F16/F32
+     * @param[in,out] max    Max values tensor. Data types supported: same as @p input
+     * @param[out]    output Destination tensor. Data types supported: same as @p input
+     * @param[out]    sum    Sum of 1D logits tensor. Data types supported: same as @p input
+     * @param[in]     beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
+     */
+    void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
+    /** Checks if the given size is eligible for parallel reduction
+     *
+     * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size).
+     * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
+     *
+     * @param[in] size Size to check
+     *
+     * @return A two-element tuple where the first element is a boolean specifying whether a parallel reduction will be run,
+     *         while the second element is the vector size used for the execution.
+     */
+    static ParallelReductionInfo is_parallel_reduction(size_t size);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_max;
+    ICLTensor       *_output;
+    ICLTensor       *_sum;
+
+private:
+    static const unsigned int _grid_size;
+    static const unsigned int _serial_vector_size;
+    static const unsigned int _parallel_vector_size;
+};
 /** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
 class CLLogits1DNormKernel : public ICLKernel
 {
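The serial/parallel split that is_parallel_reduction() encodes can be reproduced in a few lines. A standalone sketch, assuming the constants this patch later sets in CLSoftmaxLayerKernel.cpp (grid size 64, serial vector size 8, parallel vector size 4):

#include <cstddef>
#include <tuple>

std::tuple<bool, unsigned int> is_parallel_reduction_sketch(std::size_t width)
{
    const unsigned int grid_size            = 64; // work-items cooperating on one row
    const unsigned int serial_vector_size   = 8;
    const unsigned int parallel_vector_size = 4;

    // Rows shorter than one full grid pass (64 * 8 = 512 elements) are reduced
    // by a single work-item; longer rows switch to the parallel kernel, which
    // forces 4-wide vectors for its memory access pattern.
    const bool parallel = (width >= static_cast<std::size_t>(grid_size) * serial_vector_size) && (grid_size > 1);
    return std::make_tuple(parallel, parallel ? parallel_vector_size : serial_vector_size);
}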
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index d84297e9a1..72ef679d6a 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -54,8 +54,8 @@ public:
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor. Data types supported: QS8/QS16/F16/F32
-     * @param[in]  beta   A scaling factor for the exponent.
      * @param[out] output Destination tensor. Data types supported: same as @p input
+     * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
      */
     void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f);
 
@@ -63,13 +63,15 @@ public:
     void run() override;
 
 private:
-    CLMemoryGroup               _memory_group;
-    CLLogits1DMaxKernel         _max_kernel;
-    CLLogits1DShiftExpSumKernel _shift_exp_sum_kernel;
-    CLLogits1DNormKernel        _norm_kernel;
-    CLTensor                    _max;
-    CLTensor                    _sum;
-    CLTensor                    _tmp;
+    CLMemoryGroup                  _memory_group;
+    CLLogits1DMaxKernel            _max_kernel;
+    CLLogits1DShiftExpSumKernel    _shift_exp_sum_kernel;
+    CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
+    CLLogits1DNormKernel           _norm_kernel;
+    CLTensor                       _max;
+    CLTensor                       _sum;
+    CLTensor                       _tmp;
+    bool                           _run_legacy_path;
 };
 }
 #endif /* __ARM_COMPUTE_CLSOFTMAXLAYER_H__ */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 32199525b0..6efeebd63f 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -300,6 +300,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = {
     { "softmax_layer_max", "softmax_layer.cl" },
     { "softmax_layer_shift_exp_sum", "softmax_layer.cl" },
     { "softmax_layer_norm", "softmax_layer.cl" },
+    { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
+    { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
     { "suppress_non_maximum", "canny.cl" },
     { "tablelookup_U8", "tablelookup.cl" },
     { "tablelookup_S16", "tablelookup.cl" },
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
index 5476a6e070..b329118f14 100644
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -359,7 +359,12 @@ DIVQ_SAT_IMPL(qs16, qs16, qs32)
         return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), clz(sum) > dec_m); /* Saturate result if needed */ \
     }
 
+EXPQ_IMPL(qs8, qs8x2, 2)
+EXPQ_IMPL(qs8, qs8x4, 4)
+EXPQ_IMPL(qs8, qs8x8, 8)
 EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x2, 2)
+EXPQ_IMPL(qs16, qs16x4, 4)
 EXPQ_IMPL(qs16, qs16x8, 8)
 EXPQ_IMPL(qs16, qs16x16, 16)
 
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 330d67daa5..768f7ee434 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -45,6 +45,9 @@
 #define VEC_DATA_TYPE_STR(type, size) type##size
 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
 
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
 #define CONVERT_STR(x, type) (convert_##type((x)))
 #define CONVERT(x, type) CONVERT_STR(x, type)
 
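The CL_VEC_DATA_TYPE pair added to helpers.h follows the usual two-level token-pasting idiom: the extra indirection forces macro arguments to be expanded before ## glues them together. A host-side C++ illustration (the struct and names below are stand-ins for the OpenCL built-in vector types):

#include <cstdio>

#define PASTE_STR(type, size) type##size
#define PASTE(type, size) PASTE_STR(type, size)

struct float4 // stand-in for OpenCL's built-in float4
{
    float s0, s1, s2, s3;
};

#define MY_SIZE 4
// PASTE expands MY_SIZE to 4 first, so this names the token float4.
// Pasting directly with PASTE_STR would produce floatMY_SIZE and fail to compile.
typedef PASTE(float, MY_SIZE) my_vec;

int main()
{
    my_vec v{1.0f, 2.0f, 3.0f, 4.0f};
    std::printf("last lane: %f\n", v.s3);
    return 0;
}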
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 010135eb7b..5bc43ef144 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -57,8 +57,36 @@
 #endif /* FIXED_POINT_POSITION */
 
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+/* Vector size, i.e. number of vector elements. */
+#if VECTOR_SIZE == 2
+__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
+__constant uint2 idx__ = (uint2)(0, 1);
+
+#elif VECTOR_SIZE == 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
+
+#elif VECTOR_SIZE == 8
+__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
+__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+#else /* VECTOR_SIZE DEFAULT */
+#define VECTOR_SIZE 16
+#define LOG_VECTOR_SIZE 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+#endif /* VECTOR_SIZE END */
+
+// TODO (COMPMID-661): Remove if the non-fused kernels are removed
 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
 __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+__constant uint4 idx4 = (uint4)(0, 1, 2, 3);
 
 /** Identifies the maximum value across the 1st dimension.
  *
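The VECTOR_SIZE/LOG_VECTOR_SIZE pairing above lets the kernels step through a row with shifts instead of multiplies and divides. A small worked C++ example, using a row length taken from the ShapeDatasets added later in this patch:

#include <cstdio>

int main()
{
    const unsigned int width           = 353; // e.g. TensorShape{ 353U, 8U, 2U, 2U }
    const unsigned int log_vector_size = 4;   // log2 of VECTOR_SIZE == 16

    // i << LOG_VECTOR_SIZE in the kernel is i * VECTOR_SIZE.
    const unsigned int full_vectors = width >> log_vector_size;                  // 22 full 16-wide loads
    const unsigned int leftover     = width - (full_vectors << log_vector_size); // 1 trailing element
    std::printf("%u full vectors, %u leftover -> NON_MULTIPLE_OF_VECTOR_SIZE needed: %s\n",
                full_vectors, leftover, leftover != 0 ? "yes" : "no");
    return 0;
}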
@@ -277,3 +305,462 @@ __kernel void softmax_layer_norm(
     data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
     vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
 }
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then gets the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_serial(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+#endif /* BETA */
+
+    // Initialize local maximum
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
+
+    // Calculate max of row
+    const uint width_ = width >> LOG_VECTOR_SIZE;
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
+    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform max reduction
+#if VECTOR_SIZE == 16
+    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+    // Store result
+    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+
+    /* Second section */
+
+    // Load max value of 1D logits vector (row)
+    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
+
+    // Set sum vector
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    sum1D = 0;
+
+    // Shift values, exp and sum
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    data = select(0, data, widx);
+    VSTORE(VECTOR_SIZE)
+    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform sum reduction
+#if VECTOR_SIZE == 16
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+
+    // Calculate and store result
+    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+}
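The NON_MULTIPLE_OF_VECTOR_SIZE blocks in the kernel above deliberately load one full vector past the last complete block and then neutralise the out-of-range lanes with select(): a lane whose global index falls beyond width contributes MINVAL to the max and 0 to the sum. A scalar C++ model of that masking (values are made up for illustration):

#include <cstdio>
#include <limits>

int main()
{
    const unsigned int width = 9, vector_size = 4;
    const unsigned int base  = (width / vector_size) * vector_size; // 8: start of the partial vector

    // Lane 0 is the real last element; lanes 1-3 read whatever sits past the row.
    const float data[vector_size] = {7.0f, 99.0f, 99.0f, 99.0f};
    for(unsigned int lane = 0; lane < vector_size; ++lane)
    {
        const bool  in_range = (base + lane) < width; // the widx mask in the kernel
        const float for_max  = in_range ? data[lane] : std::numeric_limits<float>::lowest();
        const float for_sum  = in_range ? data[lane] : 0.0f;
        std::printf("lane %u: max sees %g, sum sees %g\n", lane, for_max, for_sum);
    }
    return 0;
}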
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then gets the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0). + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr + * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) + * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) + * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) + * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] sum_ptr Pointer to the sum values tensor slice. 
Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_parallel(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    const uint lid = get_local_id(0);
+
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
+#endif /* BETA */
+
+    // Define one temporary vector per work-item.
+    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
+    __local DATA_TYPE max_local;
+
+    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
+    // Number of elements per work-item.
+    const uint row = width / GRID_SIZE;
+    // Number of iterations per work-item.
+    const uint width_ = row >> 2;
+    // Calculate max of row
+    uint i = 0;
+    for(; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    // How many work-items needed to complete the computation.
+    //TODO: Optimize this calculation (avoid %).
+ int boundary_workitems = (width % (GRID_SIZE * 4)) / 4; + if(lid < boundary_workitems) + { + VEC_DATA_TYPE(DATA_TYPE, 4) + data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); + max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4); + } +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + if(boundary_workitems == 0) + { + boundary_workitems = GRID_SIZE; + i--; + } + if(lid == (boundary_workitems - 1)) + { + // Handle non multiple of 4 + VEC_DATA_TYPE(DATA_TYPE, 4) + data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0)); + VEC_DATA_TYPE(SELECT_DATA_TYPE, 4) + widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)); + max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4); + } +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ +#endif /* NON_MULTIPLE_OF_GRID_SIZE */ + tmp_local[lid] = max_val_vec; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(GRID_SIZE >= 256) + { + if(lid < 128) + { + tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 128) + { + if(lid < 64) + { + tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 64) + { + if(lid < 32) + { + tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 32) + { + if(lid < 16) + { + tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 16) + { + if(lid < 8) + { + tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 8) + { + if(lid < 4) + { + tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 4) + { + if(lid < 2) + { + tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lid == 0) + { + max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4); + max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2); + max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1); + max_local = max_val_vec.s0; + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Second section */ + + // Set sum vector + VEC_DATA_TYPE(DATA_TYPE, 4) + sum1D = 0; + DATA_TYPE max_val = max_local; + + // Shift values, exp and sum + for(i = 0; i < width_; i++) + { + VEC_DATA_TYPE(DATA_TYPE, 4) + data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); + data = SUB_OP(data, max_val, DATA_TYPE, 4); +#ifdef BETA + data = MUL_OP(data, beta, DATA_TYPE, 4); +#endif /* BETA */ + data = EXP_OP(data, DATA_TYPE, 4); + VSTORE(4) + (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0)); + sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4); + } +#ifdef NON_MULTIPLE_OF_GRID_SIZE + //TODO: Optimize the calculation (avoid %). 
+ boundary_workitems = (width % (GRID_SIZE * 4)) / 4; + if(lid < boundary_workitems) + { + VEC_DATA_TYPE(DATA_TYPE, 4) + data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0)); + data = SUB_OP(data, max_val, DATA_TYPE, 4); +#ifdef BETA + data = MUL_OP(data, beta, DATA_TYPE, 4); +#endif /* BETA */ + data = EXP_OP(data, DATA_TYPE, 4); + VSTORE(4) + (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0)); + sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4); + } +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + if(boundary_workitems == 0) + { + boundary_workitems = GRID_SIZE; + i--; + } + if(lid == (boundary_workitems - 1)) + { + // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride + VEC_DATA_TYPE(DATA_TYPE, 4) + data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0)); + data = SUB_OP(data, max_val, DATA_TYPE, 4); +#ifdef BETA + data = MUL_OP(data, beta, DATA_TYPE, 4); +#endif /* BETA */ + data = EXP_OP(data, DATA_TYPE, 4); + VEC_DATA_TYPE(SELECT_DATA_TYPE, 4) + widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)); + data = select(0, data, widx); + VSTORE(4) + (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0)); + sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4); + } +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ +#endif /* NON_MULTIPLE_OF_GRID_SIZE */ + tmp_local[lid] = sum1D; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(GRID_SIZE >= 256) + { + if(lid < 128) + { + tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 128) + { + if(lid < 64) + { + tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 64) + { + if(lid < 32) + { + tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 32) + { + if(lid < 16) + { + tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 16) + { + if(lid < 8) + { + tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 8) + { + if(lid < 4) + { + tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 4) + { + if(lid < 2) + { + tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lid == 0) + { + sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4); + // Perform max reduction + sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2); + sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1); + *((__global DATA_TYPE *)sum.ptr) = sum1D.s0; + } +} diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp index 1b89161e24..6b42e18132 100644 --- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp +++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp @@ -185,6 +185,137 @@ void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &qu while(window_collapsed.slide_window_slice_3D(slice)); } +/**< Grid size (obtained through auto-tuning) */ +const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64; +/**< Vector size in the serial case (obtained through auto-tuning) */ +const unsigned int 
CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
+/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost) .*/
+const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
+
+CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
+    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+    ARM_COMPUTE_ERROR_ON(beta != 1.0f && input->info()->data_type() != DataType::F32);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    const DataType dt                 = input->info()->data_type();
+    const size_t   reduction_dim_size = input->info()->dimension(0);
+    auto           beta_int           = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+    build_opts.add_option_if(is_data_type_fixed_point(dt),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
+    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
+    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+
+    // Setting _lws_hint in this way can also communicate grid_size to CLLogits1DMaxShiftExpSumKernel::run().
+    // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
+    _lws_hint                                           = cl::NullRange;
+    std::string           kernel_name                   = std::string("softmax_layer_max_shift_exp_sum_serial");
+    ParallelReductionInfo parallel_reduction_info       = is_parallel_reduction(reduction_dim_size);
+    unsigned int          vector_size                   = std::get<1>(parallel_reduction_info);
+
+    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+    build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
+    build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
+
+    // Configure parallel kernel if needed
+    if(std::get<0>(parallel_reduction_info))
+    {
+        kernel_name            = std::string("softmax_layer_max_shift_exp_sum_parallel");
+        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
+        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
+
+        // Handle boundary conditions.
+        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
+        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
+    }
+
+    // Create kernel.
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments. Both the kernels use the same arguments
+    unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_uint>(idx++, reduction_dim_size);
+
+    // Configure window
+    const unsigned int num_elems_x = ceil_to_multiple(input->info()->tensor_shape().x(), vector_size);
+    Window             win         = calculate_max_window(*input->info(), Steps(num_elems_x));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_x);
+    AccessWindowHorizontal max_access(max->info(), 0, 1);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_x);
+    AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
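configure() above leans on two small helpers worth seeing in isolation: the bit trick that checks GRID_SIZE is a power of two (a precondition for the halving reduction in the parallel kernel) and the rounding of the row length up to a whole number of vectors when sizing the access window. These are illustrative reimplementations, not the library's actual utilities:

#include <cstdio>

// True when exactly one bit is set; 0 is excluded explicitly.
static bool is_pow2(unsigned int x)
{
    return (x != 0) && ((x & (x - 1)) == 0);
}

// Smallest multiple of divisor that is >= value.
static unsigned int ceil_to_multiple_sketch(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    std::printf("is_pow2(64) = %d, is_pow2(48) = %d\n", is_pow2(64), is_pow2(48));   // 1, 0
    std::printf("ceil_to_multiple(353, 8) = %u\n", ceil_to_multiple_sketch(353, 8)); // 360
    return 0;
}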
+CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
+{
+    bool         is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
+    unsigned int vector_size           = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
+    return std::make_tuple(is_parallel_reduction, vector_size);
+}
+
+void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Collapse window in Z dimension
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+    // Reconfigure window in case of parallel reduction
+    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
+    if(std::get<0>(parallel_reduction_info))
+    {
+        // To launch grid_size parallel workitems, steps.x should be modified as follows.
+ const unsigned int step = std::get<1>(parallel_reduction_info); + window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * step, step)); + } + + // Get slices + Window slice = window_collapsed.first_slice_window_3D(); + do + { + unsigned int idx = 0; + // Set inputs + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _max, slice); + add_3D_tensor_argument(idx, _output, slice); + add_3D_tensor_argument(idx, _sum, slice); + enqueue(queue, *this, slice, _lws_hint); + } + while(window_collapsed.slide_window_slice_3D(slice)); +} + CLLogits1DNormKernel::CLLogits1DNormKernel() : _input(nullptr), _sum(nullptr), _output(nullptr) { diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp index fa324ee61d..7268d8eab5 100644 --- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp +++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp @@ -23,15 +23,19 @@ */ #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLKernel.h" #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLScheduler.h" using namespace arm_compute; CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp() + : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp(), _run_legacy_path(false) { } @@ -48,14 +52,26 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float _max.allocator()->init(tensor_info_max_sum); _sum.allocator()->init(tensor_info_max_sum); + // Set GPU target to kernels + _max_shift_exp_sum_kernel.set_target(CLScheduler::get().target()); + // Manage intermediate buffers _memory_group.manage(&_tmp); _memory_group.manage(&_max); _memory_group.manage(&_sum); - // Configure Kernels - _max_kernel.configure(input, &_max); - _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta); + // Configure kernels + // TODO (COMPMID-661): Remove legacy path once the new one is properly validated + _run_legacy_path = is_data_type_quantized_assymetric(input->info()->data_type()); + if(_run_legacy_path) + { + _max_kernel.configure(input, &_max); + _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta); + } + else + { + _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta); + } _norm_kernel.configure(&_tmp, &_sum, output); // Allocate intermediate buffers @@ -68,8 +84,16 @@ void CLSoftmaxLayer::run() { _memory_group.acquire(); - CLScheduler::get().enqueue(_max_kernel, false); - CLScheduler::get().enqueue(_shift_exp_sum_kernel, false); + // Force to use the new fused kernel + if(_run_legacy_path) + { + CLScheduler::get().enqueue(_max_kernel, false); + CLScheduler::get().enqueue(_shift_exp_sum_kernel, false); + } + else + { + CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false); + } CLScheduler::get().enqueue(_norm_kernel); _memory_group.release(); diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index 86ed2b2ad7..45f5d1c9ff 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -269,6 +269,40 @@ public: } }; +/** Data set containing small 
softmax layer shapes. */ +class SoftmaxLayerSmallShapes final : public ShapeDataset +{ +public: + SoftmaxLayerSmallShapes() + : ShapeDataset("Shape", + { + TensorShape{ 9U, 9U }, + TensorShape{ 256U, 10U, 2U }, + TensorShape{ 353U, 8U, 2U, 2U }, + TensorShape{ 512U, 7U, 2U, 2U }, + TensorShape{ 633U, 10U, 1U, 2U }, + TensorShape{ 781U, 5U, 2U }, + }) + { + } +}; + +/** Data set containing large softmax layer shapes. */ +class SoftmaxLayerLargeShapes final : public ShapeDataset +{ +public: + SoftmaxLayerLargeShapes() + : ShapeDataset("Shape", + { + TensorShape{ 1000U, 10U }, + TensorShape{ 3989U, 10U, 2U }, + TensorShape{ 4098U, 8U, 1U, 2U }, + TensorShape{ 7339U, 11U }, + }) + { + } +}; + } // namespace datasets } // namespace test } // namespace arm_compute diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp index c469b8acd8..7842c5c83b 100644 --- a/tests/validation/CL/SoftmaxLayer.cpp +++ b/tests/validation/CL/SoftmaxLayer.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" @@ -62,7 +63,7 @@ const auto CNNDataTypes = framework::dataset::make("DataType", TEST_SUITE(CL) TEST_SUITE(SoftmaxLayer) -DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), CNNDataTypes), shape, data_type) +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datasets::SoftmaxLayerSmallShapes(), datasets::SoftmaxLayerLargeShapes()), CNNDataTypes), shape, data_type) { // Set fixed point position data type allowed const int fixed_point_position = is_data_type_fixed_point(data_type) ? 
3 : 0; @@ -83,10 +84,16 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datase validate(src.info()->valid_region(), valid_region); validate(dst.info()->valid_region(), valid_region); - // Validate padding - const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding(); - validate(src.info()->padding(), padding); - validate(dst.info()->padding(), padding); + // Get reduction kernel info + CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(shape.x()); + + // Validate src padding + const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding(); + validate(src.info()->padding(), padding_src); + + // Validate dst padding + const PaddingSize padding_dst = PaddingCalculator(shape.x(), 16).required_padding(); + validate(dst.info()->padding(), padding_dst); } template @@ -94,12 +101,12 @@ using CLSoftmaxLayerFixture = SoftmaxValidationFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F16))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::SoftmaxLayerLargeShapes(), framework::dataset::make("DataType", DataType::F16))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16); @@ -107,12 +114,12 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture, framework::Dataset TEST_SUITE_END() TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::F32))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32))) +FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::SoftmaxLayerLargeShapes(), framework::dataset::make("DataType", DataType::F32))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); @@ -126,14 +133,14 @@ using CLSoftmaxLayerFixedPointFixture = SoftmaxValidationFixedPointFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", +FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(), framework::dataset::make("DataType", DataType::QS8)), framework::dataset::make("FractionalBits", 1, 6))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_fixed_point); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixedPointFixture, framework::DatasetMode::NIGHTLY, 
combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixedPointFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(), framework::dataset::make("DataType",
                                                                                                                    DataType::QS8)),
                                                                                                            framework::dataset::make("FractionalBits", 1, 6)))
 {
@@ -144,7 +151,7 @@ TEST_SUITE_END()
 
 TEST_SUITE(QS16)
 // Testing for fixed point position [1,14) as reciprocal limits the maximum fixed point position to 14
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                      framework::dataset::make("DataType",
                                                                                                                                               DataType::QS16)),
                                                                                                                      framework::dataset::make("FractionalBits", 1, 14)))
@@ -152,7 +159,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixedPointFixture, frame
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fixed_point);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType",
                                                                                                                                             DataType::QS16)),
                                                                                                                    framework::dataset::make("FractionalBits", 1, 14)))
-- 
cgit v1.2.1
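A final note on the parallel kernel in this patch: its cascade of if(GRID_SIZE >= N) blocks is a textbook shared-memory tree reduction, halving the number of active work-items at every step with a barrier in between. A sequential C++ model of the same dataflow for a power-of-two grid (array contents are arbitrary example values):

#include <cstdio>

int main()
{
    const unsigned int grid_size  = 8; // must be a power of two, as configure() enforces
    float tmp_local[grid_size]    = {3, 1, 4, 1, 5, 9, 2, 6}; // one partial sum per work-item

    for(unsigned int active = grid_size / 2; active > 0; active /= 2)
    {
        // In the kernel, every work-item with lid < active runs this step
        // concurrently, then all of them meet at barrier(CLK_LOCAL_MEM_FENCE).
        for(unsigned int lid = 0; lid < active; ++lid)
        {
            tmp_local[lid] += tmp_local[lid + active];
        }
    }
    std::printf("row sum = %g\n", tmp_local[0]); // 31
    return 0;
}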