author     Viet-Hoa Do <viet-hoa.do@arm.com>    2023-10-13 17:40:32 +0100
committer  Viet-Hoa Do <viet-hoa.do@arm.com>    2023-10-31 10:16:25 +0000
commit     29254aeb11a76c86449c2f38587e9144b2f2aacb (patch)
tree       ca2df26e81c2417b34768ac325e0f7200b5265df
parent     e5362e7e5dbccf81c5296a7e77154e11e1a14d2f (diff)
download   ComputeLibrary-29254aeb11a76c86449c2f38587e9144b2f2aacb.tar.gz
Optimize CL softmax
* The new softmax implementation consists of only a single kernel.
  - There are 2 versions of softmax, one for the x dimension and one for any other dimensions.
  - The softmax kernel handles both native and quantized data types.

Resolves: COMPMID-6447
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I4a9ae5bc63f78aebeaa85ee48a0d102c9c245eda
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10489
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
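
For reference, a minimal scalar sketch (not part of the patch) of the 3-pass scheme the new kernel documentation describes: find the row maximum, shift and exponentiate while accumulating the sum, then normalize. It assumes float data and ignores vectorization, the quantized path and the log variant; the helper name is illustrative.

    #include <algorithm>
    #include <cmath>

    // Scalar reference for one softmax row; the real kernels vectorize this
    // and fuse all three passes into a single kernel launch.
    void softmax_row_reference(const float *src, float *dst, int length, float beta)
    {
        // Pass 1: find the row maximum for numerical stability.
        float max_value = src[0];
        for (int i = 1; i < length; ++i)
        {
            max_value = std::max(max_value, src[i]);
        }

        // Pass 2: shift by the maximum, scale by beta, exponentiate and accumulate the sum.
        float sum_value = 0.f;
        for (int i = 0; i < length; ++i)
        {
            dst[i] = std::exp((src[i] - max_value) * beta);
            sum_value += dst[i];
        }

        // Pass 3: divide by the sum.
        for (int i = 0; i < length; ++i)
        {
            dst[i] /= sum_value;
        }
    }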
-rw-r--r--  Android.bp                                                   1
-rw-r--r--  SConscript                                                   1
-rw-r--r--  arm_compute/runtime/CL/functions/CLSoftmaxLayer.h           14
-rw-r--r--  docs/user_guide/release_version_and_change_log.dox           1
-rw-r--r--  src/core/CL/cl_kernels/common/softmax_layer.cl             747
-rw-r--r--  src/core/CL/cl_kernels/common/softmax_layer_quantized.cl   529
-rw-r--r--  src/gpu/cl/ClKernelLibrary.cpp                              12
-rw-r--r--  src/gpu/cl/kernels/ClSoftmaxKernel.cpp                     462
-rw-r--r--  src/gpu/cl/kernels/ClSoftmaxKernel.h                       103
-rw-r--r--  src/gpu/cl/operators/ClSoftmax.cpp                         159
-rw-r--r--  src/gpu/cl/operators/ClSoftmax.h                            45
11 files changed, 530 insertions, 1544 deletions
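
The public API is unchanged by this patch; only the backend kernels are replaced. A minimal usage sketch of CLSoftmaxLayer, assuming the usual CLScheduler initialization; the tensor shape and axis value are illustrative (axis 0 is the x-dimension case served by the new softmax_x kernel):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

    using namespace arm_compute;

    void softmax_example()
    {
        CLScheduler::get().default_init();

        // 2D input: softmax is computed over the x dimension (axis 0) of each row.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U, 16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(128U, 16U), 1, DataType::F32));

        CLSoftmaxLayer softmax;
        softmax.configure(&src, &dst, /* beta */ 1.0f, /* axis */ 0);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...

        softmax.run();
        CLScheduler::get().sync();
    }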
diff --git a/Android.bp b/Android.bp
index dc35143f76..7681205770 100644
--- a/Android.bp
+++ b/Android.bp
@@ -68,7 +68,6 @@ opencl_srcs = [
"src/core/CL/cl_kernels/common/select.cl",
"src/core/CL/cl_kernels/common/slice_ops.cl",
"src/core/CL/cl_kernels/common/softmax_layer.cl",
- "src/core/CL/cl_kernels/common/softmax_layer_quantized.cl",
"src/core/CL/cl_kernels/common/stack_layer.cl",
"src/core/CL/cl_kernels/common/tile.cl",
"src/core/CL/cl_kernels/common/transpose.cl",
diff --git a/SConscript b/SConscript
index fab9b65acc..8685d01388 100644
--- a/SConscript
+++ b/SConscript
@@ -451,7 +451,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/common/select.cl',
'src/core/CL/cl_kernels/common/slice_ops.cl',
'src/core/CL/cl_kernels/common/softmax_layer.cl',
- 'src/core/CL/cl_kernels/common/softmax_layer_quantized.cl',
'src/core/CL/cl_kernels/common/stack_layer.cl',
'src/core/CL/cl_kernels/common/tile.cl',
'src/core/CL/cl_kernels/common/transpose.cl',
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 70ef1f4402..68541e35c5 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H
-#define ARM_COMPUTE_CLSOFTMAXLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -43,12 +43,6 @@ class CLCompileContext;
*
* Log Softmax is calculated by :
* @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
- *
- * This function runs the following operators/kernels:
- * -# If axis is not 0:
- * -# @ref opencl::ClPermute
- * -# @ref opencl::kernels::ClLogits1DNormKernel
- * -# @ref opencl::kernels::ClLogits1DMaxShiftExpSumKernel
*/
template <bool IS_LOG = false>
class CLSoftmaxLayerGeneric : public IFunction
@@ -115,4 +109,4 @@ private:
using CLSoftmaxLayer = CLSoftmaxLayerGeneric<false>;
using CLLogSoftmaxLayer = CLSoftmaxLayerGeneric<true>;
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSOFTMAXLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index b2500944ca..2b8f5d87a1 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -57,6 +57,7 @@ v23.11 Public major release
- Optimize @ref opencl::ClTranspose
- Optimize @ref NEStackLayer
- Optimize @ref CLReductionOperation.
+ - Optimize @ref CLSoftmaxLayer.
- Add new OpenCL™ kernels:
- @ref opencl::kernels::ClMatMulLowpNativeMMULKernel support for QASYMM8 and QASYMM8_SIGNED, with batch support
- Deprecate support for Bfloat16 in @ref cpu::CpuCast.
diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
index 4d2d89dd73..58c458982d 100644
--- a/src/core/CL/cl_kernels/common/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,511 +21,344 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
+#define MIN_VALUE_float -FLT_MAX
+#define MIN_VALUE_half -HALF_MAX
+#define MIN_VALUE_char CHAR_MIN
+#define MIN_VALUE_uchar 0
+
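+// Two-level expansion so that DATA_TYPE is expanded to its value before token pasting selects the matching MIN_VALUE_* constant.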
+#define MIN_VALUE_TYPE_STR(data_type) MIN_VALUE_##data_type
+#define MIN_VALUE_TYPE(data_type) MIN_VALUE_TYPE_STR(data_type)
+#define MIN_VALUE MIN_VALUE_TYPE(DATA_TYPE)
+
+#ifdef SOFTMAX_X
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+/** 3-pass softmax in the x dimension.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in the softmax axis of the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized.
+ * - VEC_SIZE: the size of the vector.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
*/
-__kernel void softmax_layer_norm(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
+__kernel void softmax_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element
+
+#ifdef IS_QUANTIZED
+ ,
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element
+#endif // IS_QUANTIZED
+)
{
- const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
+ const int dim_0 = get_global_id(0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+#ifdef IS_QUANTIZED
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+#else // IS_QUANTIZED
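+    // Without quantization the temporary values have the same type as the output, so the destination tensor is reused as scratch storage.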
+ __global uchar *tmp_ptr = dst_ptr;
+#endif // IS_QUANTIZED
- // Load max value of 1D logits vector (row)
- DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
- VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
- data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+ // Calculate max value.
+ DATA_TYPE max_value = MIN_VALUE;
+ int i = 0;
-#if defined(LOG_SOFTMAX)
- sum_val = log(sum_val);
- data0 -= sum_val;
-#else // defined(LOG_SOFTMAX)
- data0 /= sum_val;
-#endif // defined(LOG_SOFTMAX)
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)));
- STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
+ max_value = max(max_value, MAX_REDUCE(data, VEC_SIZE));
+ }
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
+ for (; i < LENGTH; ++i)
+ {
+ DATA_TYPE data = *(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE));
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
+ max_value = max(max_value, data);
+ }
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ // Regularize the data.
+ TMP_DATA_TYPE sum_value = 0;
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_serial(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+#ifdef IS_QUANTIZED
+ TMP_DATA_TYPE max_value_f = (CONVERT(max_value, TMP_DATA_TYPE) - SRC_OFFSET) * SRC_SCALE;
+ TMP_DATA_TYPE regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
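+    // This is ((x - SRC_OFFSET) * SRC_SCALE - max_value_f) * BETA with the constant terms folded into regularize_offset.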
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+ {
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
+ data = REGULARIZE(data);
- // Initialize local maximum
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
+#ifdef IS_LOG
+ sum_value += SUM_REDUCE(exp(data), VEC_SIZE);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += SUM_REDUCE(data, VEC_SIZE);
+#endif // IS_LOG
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
- SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx));
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+ }
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+ for (; i < LENGTH; ++i)
{
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- max_val_vec = max(data, max_val_vec);
+ TMP_DATA_TYPE data = CONVERT(*(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)), TMP_DATA_TYPE);
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
+ data = exp(data);
+ sum_value += data;
+#endif // IS_LOG
+
+ *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)) = data;
}
- // Perform max reduction
- DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- *((__global DATA_TYPE *)maxo.ptr) = max_val;
-
- /* Second section */
-
- // Set sum vector
- VEC_TYPE sum1D = 0;
-
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)dst_addr);
-#endif /* LOG_SOFTMAX */
- sum1D += data;
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- // Shift values, exp and sum
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+ TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte)
+# else // IS_LOG
+ TMP_DATA_TYPE norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(SIZE, x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+# define NORMALIZE(SIZE, x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(SIZE, x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
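+    // Quantized non-log case: round((x / sum_value) / DST_SCALE) + DST_OFFSET, i.e. the
+    // softmax probability requantized with the destination scale and offset, with saturation.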
+
+ for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
{
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result = NORMALIZE(VEC_SIZE, data);
+
+ VSTORE(VEC_SIZE)(result, 0, (__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)));
}
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
+ for (; i < LENGTH; ++i)
+ {
+ TMP_DATA_TYPE data = *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE));
+
+ DATA_TYPE result = NORMALIZE(1, data);
+
+ *(__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)) = result;
+ }
+
+#undef NORMALIZE
}
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then gets the exponent of each element as sums all elements across each row.
+#endif // SOFTMAX_X
+
+#ifdef SOFTMAX_NON_X
+
+/** 3-pass softmax in any dimension higher than the x dimension.
+ *
+ * List of preprocessors:
+ * - DATA_TYPE: the input/output data type.
+ * - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ * If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ * - IS_LOG (optional): indicating whether this is log softmax.
+ * - LENGTH: the number of elements in the softmax axis of the input/output tensors.
+ * - BETA: the beta coefficient.
+ * - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized.
+ * - VEC_SIZE: the size of the vector.
+ * - VEC_SIZE_LEFTOVER: the size of the leftover part.
*
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ * - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ * - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in] src_ptr Pointer to the source tensor.
+ * @param[in] src_stride_0 Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1 Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2 Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr Pointer to the destination tensor.
+ * @param[in] dst_stride_0 Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1 Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2 Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2 Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
*/
-__kernel void softmax_layer_max_shift_exp_sum_parallel(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
+__kernel void softmax_non_x(
+ __global uchar *src_ptr,
+ uint src_stride_0,
+ uint src_stride_1,
+ uint src_stride_2,
+ uint src_offset_first_element,
+
+ __global uchar *dst_ptr,
+ uint dst_stride_0,
+ uint dst_stride_1,
+ uint dst_stride_2,
+ uint dst_offset_first_element,
+
+ __global uchar *tmp_ptr,
+ uint tmp_stride_0,
+ uint tmp_stride_1,
+ uint tmp_stride_2,
+ uint tmp_offset_first_element,
+
+ uint src_stride_axis,
+ uint dst_stride_axis
+)
{
- const uint lid = get_local_id(0);
- const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE);
+ const int dim_0 = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+ const int dim_1 = get_global_id(1);
+ const int dim_2 = get_global_id(2);
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+ src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+ dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+ tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+    // Calculate the max value and store the input data to the temporary tensor in a suitable format.
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE;
+ int i = 0;
-#ifdef BETA
- // Initialize beta
- VEC_TYPE beta = (VEC_TYPE)BETA;
-#endif /* BETA */
-
- // Define one temporary vector per work-item.
- __local VEC_TYPE tmp_local[GRID_SIZE];
- __local DATA_TYPE max_local;
-
- VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL);
-
- // Number of iterations per work-item.
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
- // Calculate max of row
- uint i = 0;
- for(; i < width; ++i)
- {
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- // How many work-items needed to complete the computation.
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
+ for (i = 0; i < LENGTH; ++i)
{
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- SELECT_TYPE widx;
- if(lid == 0)
- {
- // Handle non multiple of 4
- VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx));
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = max_val_vec;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * src_stride_axis));
- barrier(CLK_LOCAL_MEM_FENCE);
+ max_value = max(max_value, data);
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
+ VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE)));
}
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]);
- max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- /* Second section */
+ // Regularize the data.
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) sum_value = 0;
- // Set sum vector
- VEC_TYPE sum1D = 0;
- DATA_TYPE max_val = max_local;
+#ifdef IS_QUANTIZED
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) max_value_f = (CONVERT(max_value, VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)) - SRC_OFFSET) * SRC_SCALE;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
- // Shift values, exp and sum
- for(i = 0; i < width; ++i)
- {
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
+ for (i = LENGTH - 1; i >= 0; --i)
{
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- data = exp(data);
-#else /* LOG_SOFTMAX */
+ // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
+ //
+ // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
+ // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it
+    // to the second pass to reduce the memory bandwidth of the first pass.
+ //
+ // This pass reads the quantized data from the temporary tensor and writes the dequantized data
+ // back to the temporary tensor, hence we need to loop in reverse to avoid overwriting unprocessed data.
+
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+ data = REGULARIZE(data);
+
+#ifdef IS_LOG
+ sum_value += exp(data);
+#else // IS_LOG
data = exp(data);
- VSTORE(VECTOR_SIZE)
- (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
+ sum_value += data;
+#endif // IS_LOG
+
+ VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
}
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(lid == 0)
+
+#undef REGULARIZE
+
+ // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET;
+# define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte)
+# else // IS_LOG
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE;
+# define NORMALIZE(x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, VEC_SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
+# endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+# define NORMALIZE(x) ((x) - log(sum_value))
+# else // IS_LOG
+# define NORMALIZE(x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+ for (i = 0; i < LENGTH; ++i)
{
- // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
- VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data -= max_val;
-#ifdef BETA
- data *= beta;
-#endif /* BETA */
-#ifdef LOG_SOFTMAX
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- data = exp(data);
- data = select(0, data, widx);
-#else /* LOG_SOFTMAX */
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
-#endif /* LOG_SOFTMAX */
- sum1D += data;
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = sum1D;
+ VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
- barrier(CLK_LOCAL_MEM_FENCE);
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result0 = NORMALIZE(data);
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] += tmp_local[lid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] += tmp_local[lid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] += tmp_local[lid + 32];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] += tmp_local[lid + 16];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] += tmp_local[lid + 8];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] += tmp_local[lid + 4];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] += tmp_local[lid + 2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
- // Perform sum reduction
- *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
+ STORE_VECTOR_SELECT(result, DATA_TYPE, dst_ptr + i * dst_stride_axis, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
+
+#undef NORMALIZE
}
-#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
-#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER)
\ No newline at end of file
+#endif // SOFTMAX_NON_X
+
+#undef MIN_VALUE
+#undef MIN_VALUE_TYPE
+#undef MIN_VALUE_TYPE_STR
+
+#undef MIN_VALUE_float
+#undef MIN_VALUE_half
+#undef MIN_VALUE_char
+#undef MIN_VALUE_uchar
diff --git a/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
deleted file mode 100644
index 192c5f97a1..0000000000
--- a/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
+++ /dev/null
@@ -1,529 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)
-
-#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
-#define VEC_FLOAT VEC_DATA_TYPE(float, VECTOR_SIZE)
-
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void softmax_layer_norm_quantized(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(sum),
- TENSOR3D_DECLARATION(dst))
-{
- const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
-#ifdef BETA
- // Initialize beta
- VEC_FLOAT beta = (VEC_FLOAT)BETA;
- VEC_FLOAT scale_beta = -BETA * SCALE;
-#else /* BETA */
- VEC_FLOAT scale_beta = -SCALE;
-#endif /* BETA */
-
- // Load max value of 1D logits vector (row)
- float sum_val = *((__global float *)offset(&sum, 0, get_global_id(1)));
- float sum_val_inverse = 256.f / sum_val;
-
- VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
- VEC_FLOAT data_diff_f = CONVERT(data_diff, VEC_FLOAT);
-
- data_diff_f *= scale_beta;
- data_diff_f = exp(data_diff_f);
- data_diff_f *= sum_val_inverse;
-
-#ifdef QASYMM8_SIGNED
- data_diff_f -= 128.f;
-#endif /* QASYMM8_SIGNED */
- VEC_INT data = CONVERT(data_diff_f, VEC_INT);
- VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
- STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
-
-VEC_INT mult_by_quantized_multiplier(VEC_INT data)
-{
-#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
- if(INPUT_BETA_MULTIPLIER > 1)
- {
- return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
- }
-#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
- return data;
-}
-
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case the input is not multiple of VECTOR_SIZE -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
- VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
-
-#ifdef BETA
- // Initialize beta
- VEC_FLOAT beta = (VEC_FLOAT)BETA;
- VEC_FLOAT scale_beta = -BETA * SCALE;
-#else /* BETA */
- VEC_FLOAT scale_beta = -SCALE;
-#endif /* BETA */
-
- // Calculate max of row
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
- VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- max_val_vec = max(data, max_val_vec);
- }
-
- // Perform max reduction
- DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- *((__global DATA_TYPE *)maxo.ptr) = max_local;
-
- // Second part
-
- // Load max value of 1D logits vector (row)
- int max_val = convert_int(max_local);
- VEC_FLOAT sum1D_f = 0.f;
- // Start with the leftover items
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = max_val - data_fp;
- VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT);
- data_fp_f *= scale_beta;
- data_fp_f = exp(data_fp_f);
- data_fp_f = select(0, data_fp_f, widx);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data_diff, 0, (__global int *)dst_addr);
- sum1D_f += data_fp_f;
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
- // Do the rest and compute exp and sum
- for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = max_val - data_fp;
- VEC_FLOAT data_fp_f = CONVERT(data_diff, VEC_FLOAT);
- data_fp_f *= scale_beta;
- data_fp_f = exp(data_fp_f);
- sum1D_f += data_fp_f;
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
- }
- // Perform sum reduction
- *((__global float *)sum.ptr) = SUM_REDUCE(sum1D_f, VECTOR_SIZE);
-}
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then computes the exponential of each shifted element and sums all the elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
- * @note The minimum value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE
- * @note In case the input's first dimension is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
- * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
- * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
- * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: S32
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- */
-__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(maxo),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(sum))
-{
- const uint lid = get_local_id(0);
- const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);
-
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
- Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
-
- // Define one temporary vector per work-item.
- __local VEC_INT tmp_local[GRID_SIZE];
- __local DATA_TYPE max_local;
-
- VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
- VEC_BASE max_val_vec = vec_min_val;
-
- // Number of iterations per work-item.
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
- // Calculate max of row
- uint i = 0;
- for(; i < width; ++i)
- {
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- // Number of work-items needed to complete the computation.
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- max_val_vec = max(data_max, max_val_vec);
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- VEC_INT widx;
- if(lid == 0)
- {
- // Handle the leftover elements when the width is not a multiple of VECTOR_SIZE
- VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
- max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
- max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* Second section */
-
- // Set sum vector
- VEC_INT sum1D = 0;
- int max_val = convert_int(max_local);
-
- // Shift values, exp and sum
- for(i = 0; i < width; ++i)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-#ifdef NON_MULTIPLE_OF_GRID_SIZE
- boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
- if(lid < boundary_workitems)
- {
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE(VECTOR_SIZE)
- (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
- sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- }
-#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
- if(lid == 0)
- {
- // Handle the leftover elements when the width is not a multiple of the vector size
- VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
- VEC_INT data_fp = CONVERT(data, VEC_INT);
- VEC_INT data_diff = data_fp - max_val;
- VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
- data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
- data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
- VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
- (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
- data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
- data_fp = select(0, data_fp, widx);
- sum1D = sum1D + data_fp;
- }
-#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
-#endif /* NON_MULTIPLE_OF_GRID_SIZE */
- tmp_local[lid] = sum1D;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if(GRID_SIZE >= 256)
- {
- if(lid < 128)
- {
- tmp_local[lid] += tmp_local[lid + 128];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 128)
- {
- if(lid < 64)
- {
- tmp_local[lid] += tmp_local[lid + 64];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 64)
- {
- if(lid < 32)
- {
- tmp_local[lid] += tmp_local[lid + 32];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 32)
- {
- if(lid < 16)
- {
- tmp_local[lid] += tmp_local[lid + 16];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 16)
- {
- if(lid < 8)
- {
- tmp_local[lid] += tmp_local[lid + 8];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 8)
- {
- if(lid < 4)
- {
- tmp_local[lid] += tmp_local[lid + 4];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(GRID_SIZE >= 4)
- {
- if(lid < 2)
- {
- tmp_local[lid] += tmp_local[lid + 2];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- if(lid == 0)
- {
- sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
- // Perform sum reduction
- *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
- }
-}
-#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
-#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */
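For reference, the per-row computation that the removed serial kernel performed reduces to the following scalar sketch. This is a minimal illustration, not library code: quantized_row_exp_sum is a hypothetical helper, and it assumes a QASYMM8 row with uniform quantization scale `scale` and exponent scaling factor `beta`. The parallel variant above computes the same quantities, but tree-reduces the per-work-item partial maxima and partial sums through local memory in log2(GRID_SIZE) steps (the cascade of GRID_SIZE >= N blocks).

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical scalar reference for softmax_layer_max_shift_exp_sum_quantized_serial:
// stores diff = max - x for every element (what the kernel writes to dst) and returns
// sum(exp(-beta * scale * diff)) (what the kernel writes to the per-row sum tensor).
float quantized_row_exp_sum(const std::vector<uint8_t> &row, float scale, float beta,
                            std::vector<int32_t> &diffs_out)
{
    const int32_t max_val    = *std::max_element(row.begin(), row.end());
    const float   scale_beta = -beta * scale; // matches the kernel's scale_beta define
    float         sum1d      = 0.f;

    diffs_out.resize(row.size());
    for (std::size_t i = 0; i < row.size(); ++i)
    {
        const int32_t diff = max_val - static_cast<int32_t>(row[i]);
        diffs_out[i]       = diff;
        sum1d += std::exp(static_cast<float>(diff) * scale_beta); // = exp(beta * scale * (x - max))
    }
    return sum1d;
}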
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index bcade94522..4544a66e39 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -346,12 +346,8 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{"select_same_rank", "common/select.cl"},
{"select_different_rank_2", "common/select.cl"},
{"select_different_rank_n", "common/select.cl"},
- {"softmax_layer_norm", "common/softmax_layer.cl"},
- {"softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl"},
- {"softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl"},
- {"softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl"},
+ {"softmax_x", "common/softmax_layer.cl"},
+ {"softmax_non_x", "common/softmax_layer.cl"},
{"stack_layer", "common/stack_layer.cl"},
{"strided_slice", "common/slice_ops.cl"},
{"tile", "common/tile.cl"},
@@ -735,10 +731,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/common/softmax_layer.clembed"
},
{
- "common/softmax_layer_quantized.cl",
-#include "./cl_kernels/common/softmax_layer_quantized.clembed"
- },
- {
"common/slice_ops.cl",
#include "./cl_kernels/common/slice_ops.clembed"
},
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
index 1b5a2666bc..796345a923 100644
--- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -23,361 +23,241 @@
*/
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
-#include "src/core/CL/CLValidate.h"
-#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
+#include <string>
+
namespace arm_compute
{
namespace opencl
{
namespace kernels
{
-namespace
-{
-/** Calculates softmax parameters from the quantized input scale and the exponent scaling factor, and returns them as build options.
- *
- * Prepares these build options:
- * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier.
- * -DIFF_MIN - threshold difference between the maximum value of the input data and the current processed value;
- * it defines whether the value will be taken into account or not.
- *
- * @param[in] input_scale Input scaling factor
- * @param[in] beta        Exponent scaling factor beta
- *
- * @return The build options for the quantized softmax kernels
- */
-CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
-{
- // Number of integer bits in temporary fixed-point representation of current-to-max difference
- static const int scaled_diff_int_bits = 5;
- // Number of integer bits used in temporary fixed-point representation of exponent accumulator
- static const int exp_accumulation_in_bits = 12;
-
- const double beta_multiplier =
- std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0);
- int input_beta_multiplier;
- int input_beta_left_shift;
- quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier,
- &input_beta_left_shift);
-
- const double max_input_rescaled =
- 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
- const int diff_min = -1.f * std::floor(max_input_rescaled);
- CLBuildOptions build_opts;
- build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
- build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
- build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
- build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
- build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));
-
- return build_opts;
+ClSoftmaxKernel::ClSoftmaxKernel()
+{
}
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src,
- const ITensorInfo &max,
- const ITensorInfo &dst,
- const ITensorInfo &sum)
+Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
+ ARM_COMPUTE_UNUSED(src, dst, info);
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
+ ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4);
- // Checks performed when output is configured
- if (dst.total_size() != 0)
- {
- if (is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- // Checks performed when sum is configured
- if (sum.total_size() != 0)
- {
- if (is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&max, &sum);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&max, &sum);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( //
+ &src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- return Status{};
-}
+ ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type());
+ ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
+ static_cast<int32_t>(src.num_dimensions()) <= info.axis);
-Status validate_arguments_1DNorm(const ITensorInfo &src,
- const ITensorInfo &sum,
- const ITensorInfo &dst,
- const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &sum);
- ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type));
-
- // Note: output should always have a scale of 1/256 and offset 0
- const QuantizationInfo allowed_quantization_info =
- get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
-
- // Checks performed when output is configured
- if (dst.total_size() != 0)
+ if (is_data_type_quantized_asymmetric(src.data_type()))
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- if (!is_quantized_asymmetric)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != allowed_quantization_info);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() !=
+ get_softmax_output_quantization_info(src.data_type(), info.is_log));
}
return Status{};
}
-} // namespace
-
-/**< Grid size (obtained through auto-tuning) */
-const unsigned int ClLogits1DMaxShiftExpSumKernel::_grid_size = 64;
-/**< Vector size in the serial case (obtained through auto-tuning) */
-const unsigned int ClLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
-/**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost). */
-const unsigned int ClLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
-
-ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel()
-{
- _type = CLKernelType::ELEMENTWISE;
-}
-void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context,
- const ITensorInfo &src,
- ITensorInfo &max,
- ITensorInfo &dst,
- ITensorInfo &sum,
- const SoftmaxKernelInfo &info)
+void ClSoftmaxKernel::configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info)
{
- auto padding_info = get_padding_info({&src, &max, &dst, &sum});
+ ARM_COMPUTE_UNUSED(compile_context, src, dst, info);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape()));
- auto_init_if_empty(dst, *src.clone());
+ const auto &dst_shape = dst.tensor_shape();
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
+ const auto data_type = src.data_type();
+ const auto element_size = src.element_size();
- const DataType dt = src.data_type();
- const UniformQuantizationInfo qinfo = src.quantization_info().uniform();
- const size_t reduction_dim_size = src.dimension(0);
- const float beta = info.beta;
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
+ const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED;
+ const auto src_qinfo = src.quantization_info().uniform();
+ const auto dst_qinfo = dst.quantization_info().uniform();
- const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size);
+ const auto axis = wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()));
+ const auto length = dst_shape[axis];
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size));
- build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size));
- build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
- build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
- build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
- build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f),
- "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
- build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX")
- : std::string("-FLT_MAX")));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
- "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
- build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
- "-DBETA=" + float_to_string_with_full_precision(beta));
- build_opts.add_options_if(is_data_type_quantized_asymmetric(dt),
- prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
-
- cl::NDRange lws_hint(cl::NullRange);
- std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") +
- (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial";
-
- // Create kernel.
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+ const auto tmp_data_type = is_quantized ? DataType::F32 : data_type;
- // Configure window
- Window win = calculate_max_window(src, Steps(reduction_dim_size));
- IClKernel::configure_internal(win, lws_hint);
+ const auto vec_size = adjust_vec_size(16 / element_size, dst_shape[0]);
+ const auto vec_size_leftover = dst_shape[0] % vec_size;
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
+ std::string kernel_name("softmax");
+ CLBuildOptions build_opts;
-Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src,
- const ITensorInfo &max,
- const ITensorInfo &dst,
- const ITensorInfo &sum)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
- return Status{};
-}
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length));
+ build_opts.add_option_if(info.is_log, "-DIS_LOG");
+ build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta));
+
+ build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED");
+ build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset));
+ build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale));
+ build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset));
+ build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));
+
+ if (axis == 0)
+ {
+ kernel_name += "_x";
+ build_opts.add_option("-DSOFTMAX_X");
-ClLogits1DMaxShiftExpSumKernel::ParallelReductionInfo ClLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
-{
- bool is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
- unsigned int vector_size = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
- return std::make_tuple(is_parallel_reduction, vector_size);
-}
+ if (is_quantized)
+ {
+ _tmp_info = TensorInfo(dst_shape, 1, tmp_data_type);
+ }
+ }
+ else
+ {
+ kernel_name += "_non_x";
+ build_opts.add_option("-DSOFTMAX_NON_X");
-void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ TensorShape tmp_shape;
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- auto max = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0));
- auto sum = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_1));
+ tmp_shape.set(0, length * vec_size, false);
+ tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, max, sum);
+ for (size_t i = 2; i <= static_cast<size_t>(axis); ++i)
+ {
+ tmp_shape.set(i, dst_shape[i - 1], false);
+ }
- // Collapse window in Z dimension
- Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+ for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i)
+ {
+ tmp_shape.set(i, dst_shape[i], false);
+ }
- // Reconfigure window in case of parallel reduction
- ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0));
- if (std::get<0>(parallel_reduction_info))
- {
- // Launch grid_size parallel work items
- window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1));
+ _tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type);
}
- // Get slices
- Window slice = window_collapsed.first_slice_window_3D();
- do
- {
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, max, slice);
- add_3D_tensor_argument(idx, dst, slice);
- add_3D_tensor_argument(idx, sum, slice);
- enqueue(queue, *this, slice, lws_hint());
- } while (window_collapsed.slide_window_slice_3D(slice));
-}
-
-ClLogits1DNormKernel::ClLogits1DNormKernel()
-{
- _type = CLKernelType::ELEMENTWISE;
-}
-
-void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context,
- const ITensorInfo &src,
- const ITensorInfo &sum,
- ITensorInfo &dst,
- const SoftmaxKernelInfo &info)
-{
- auto padding_info = get_padding_info({&src, &dst, &sum});
-
- // Note: output should always have a scale of 1/256 and offset 0
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
- const DataType output_data_type = info.input_data_type;
- const QuantizationInfo allowed_quantization_info =
- get_softmax_output_quantization_info(info.input_data_type, info.is_log);
- const UniformQuantizationInfo qinfo = src.quantization_info().uniform();
+ _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- // Output auto initialization if not yet initialized
- auto_init_if_empty(dst,
- src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
+ // Configure kernel window and kernel arguments.
+ Window win = calculate_max_window(src, Steps(vec_size));
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info));
+ bool has_collapsed = true;
- const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type);
- const int min_value = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
- const unsigned int vector_size = adjust_vec_size(16, src.dimension(0));
+ win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS.
+ win = win.collapse_if_possible(win, 2, has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
- // Set build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type));
- build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(src.dimension(0) % vector_size));
- build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED");
- build_opts.add_options_if(is_quantized_asymmetric,
- prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
- build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
- build_opts.add_option_if(is_quantized_asymmetric, "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
- build_opts.add_option_if(is_quantized_asymmetric, "-DBETA=" + float_to_string_with_full_precision(info.beta));
-
- // Create kernel
- std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : "");
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Configure window
- auto win = calculate_max_window(src, Steps(vector_size));
ICLKernel::configure_internal(win);
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
-}
-
-Status ClLogits1DNormKernel::validate(const ITensorInfo &src,
- const ITensorInfo &sum,
- const ITensorInfo &dst,
- const SoftmaxKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info));
+ _axis = axis;
- return Status{};
+ _config_id = "softmax_" + lower_string(string_from_data_type(data_type));
+ _config_id += "_" + std::to_string(axis);
+ _config_id += "_" + std::to_string(length);
}
-void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const auto src =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ICLTensor *tmp = (_tmp_info.total_size() > 0)
+ ? utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0))
+ : nullptr;
- auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
- auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- auto sum = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0));
+ if (!_prepared)
+ {
+ _prepared = true;
+
+ const auto *src_info = src->info();
+ const auto *dst_info = dst->info();
+ auto src_strides = src_info->strides_in_bytes();
+ auto dst_strides = dst_info->strides_in_bytes();
+
+ const auto src_stride_axis = src_strides[_axis];
+ const auto dst_stride_axis = dst_strides[_axis];
+
+ // This axis has been removed from the execution window, hence we remove it from the
+ // list of strides provided to the kernel.
+ // In case axis > 0, src/dst_stride_axis will be provided in a dedicated argument
+ // independent of the global ID.
+ src_strides.remove(_axis);
+ dst_strides.remove(_axis);
+
+ // Argument 0: src_ptr.
+ _kernel.setArg<cl_uint>(1, src_strides[0]);
+ _kernel.setArg<cl_uint>(2, src_strides[1]);
+ _kernel.setArg<cl_uint>(3, src_strides[2]);
+ _kernel.setArg<cl_uint>(4, src_info->offset_first_element_in_bytes());
+
+ // Argument 5: dst_ptr.
+ _kernel.setArg<cl_uint>(6, dst_strides[0]);
+ _kernel.setArg<cl_uint>(7, dst_strides[1]);
+ _kernel.setArg<cl_uint>(8, dst_strides[2]);
+ _kernel.setArg<cl_uint>(9, dst_info->offset_first_element_in_bytes());
+
+ if (tmp != nullptr)
+ {
+ const auto *tmp_info = tmp->info();
+ const auto &tmp_strides = tmp_info->strides_in_bytes();
+
+ // Argument 10: tmp_ptr.
+ _kernel.setArg<cl_uint>(11, tmp_strides[1]);
+ _kernel.setArg<cl_uint>(12, tmp_strides[2]);
+ _kernel.setArg<cl_uint>(13, tmp_strides[3]);
+ _kernel.setArg<cl_uint>(14, 0);
+ }
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, sum);
+ if (_axis > 0)
+ {
+ _kernel.setArg<cl_uint>(15, src_stride_axis);
+ _kernel.setArg<cl_uint>(16, dst_stride_axis);
+ }
+ }
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
+ _kernel.setArg(0, src->cl_buffer());
+ _kernel.setArg(5, dst->cl_buffer());
- do
+ if (tmp != nullptr)
{
- Window sum_slice = slice;
- sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- unsigned int idx = 0;
- // Set inputs
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, sum, sum_slice);
- add_3D_tensor_argument(idx, dst, slice);
- enqueue(queue, *this, slice, lws_hint());
- } while (window_collapsed.slide_window_slice_3D(slice));
+ _kernel.setArg(10, tmp->cl_buffer());
+ }
+
+ enqueue(queue, *this, window, lws_hint());
}
+
+const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const
+{
+ return _tmp_info;
+}
+
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
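The only non-obvious host-side math removed from this file is prepare_quantized_softmax_build_options(). A numeric walk-through for the common case scale = 1/256, beta = 1 may help; it assumes calculate_quantized_multiplier_greater_than_one() follows the usual Q31 convention (a mantissa in [0.5, 1) plus a left shift), which is an assumption of this sketch rather than something stated in the patch.

#include <cmath>
#include <cstdio>

int main()
{
    const int    scaled_diff_int_bits = 5;           // fixed-point integer bits of (x - max)
    const double input_scale          = 1.0 / 256.0;
    const double beta                 = 1.0;

    // beta_multiplier = beta * scale * 2^(31 - 5) = 2^18 = 262144
    const double beta_multiplier = beta * input_scale * (1 << (31 - scaled_diff_int_bits));

    // Assumed Q31 split: beta_multiplier = mantissa * 2^left_shift, mantissa in [0.5, 1)
    const int input_beta_left_shift = static_cast<int>(std::floor(std::log2(beta_multiplier))) + 1; // 19

    // max_input_rescaled = 31 * 2^26 / 2^19 = 3968, hence diff_min = -3968
    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) *
                                      (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
    const int diff_min = -static_cast<int>(std::floor(max_input_rescaled));

    std::printf("INPUT_BETA_LEFT_SHIFT=%d DIFF_MIN=%d\n", input_beta_left_shift, diff_min);
    return 0;
}

Differences more negative than DIFF_MIN would underflow the fixed-point exponential, so the removed kernels dropped them from the sum. The replacement kernel avoids this machinery entirely: for quantized inputs the intermediate computation runs in float (see -DTMP_DATA_TYPE in the new configure() above).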
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h
index 2dd53da346..130dc7835c 100644
--- a/src/gpu/cl/kernels/ClSoftmaxKernel.h
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,11 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_SOFTMAX_KERNEL_H
-#define ARM_COMPUTE_CL_SOFTMAX_KERNEL_H
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
#include "src/core/common/Macros.h"
#include "src/gpu/cl/ClCompileContext.h"
@@ -37,94 +39,47 @@ namespace opencl
{
namespace kernels
{
-/** Interface for max, shifting, exponentiating and summing the logits */
-class ClLogits1DMaxShiftExpSumKernel : public IClKernel
-{
- /**< Grid size (obtained through auto-tuning) */
- static const unsigned int _grid_size;
- /**< Vector size in the serial case (obtained through auto-tuning) */
- static const unsigned int _serial_vector_size;
- /**< Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost). */
- static const unsigned int _parallel_vector_size;
+/** The CL kernel that performs the softmax function. */
+class ClSoftmaxKernel : public IClKernel
+{
public:
- /** Info for whether a parallel reduction will be run and the vector size of the execution. */
- using ParallelReductionInfo = std::tuple<bool, unsigned int>;
+ ClSoftmaxKernel();
- ClLogits1DMaxShiftExpSumKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DMaxShiftExpSumKernel);
- /** Configure the kernel using the given information about tensors
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in,out] max Max values tensor. Data types supported: same as @p src
- * @param[out] dst Destination tensor. Data types supported: same as @p src
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context,
- const ITensorInfo &src,
- ITensorInfo &max,
- ITensorInfo &dst,
- ITensorInfo &sum,
- const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure()
- *
- * @return a status
- */
- static Status
- validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum);
- /** Checks if the given size is eligible for parallel reduction
- *
- * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size).
- * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSoftmaxKernel);
+
+ /** Check if the kernel arguments are valid.
*
- * @param[in] size Size to check
+ * See @ref ClSoftmaxKernel::configure().
*
- * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run,
- * while the second element is the vector size of the execution.
+ * @return The status.
*/
- static ParallelReductionInfo is_parallel_reduction(size_t size);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
-};
-
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class ClLogits1DNormKernel : public IClKernel
-{
-public:
- ClLogits1DNormKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DNormKernel);
+ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
- /** Set the input and output tensors.
+ /** Configure the kernel.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] src Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src
* @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
*/
void configure(const CLCompileContext &compile_context,
const ITensorInfo &src,
- const ITensorInfo &sum,
ITensorInfo &dst,
const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref ClLogits1DNormKernel::configure()
- *
- * @return a status
- */
- static Status
- validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+ /** Get the tensor info of the temporary tensor. */
+ const TensorInfo &tmp_tensor_info() const;
+
+private:
+ bool _prepared{false};
+ int32_t _axis{0};
+ TensorInfo _tmp_info{};
};
+
} // namespace kernels
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_SOFTMAX_KERNEL_H */
+#endif // ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
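A hypothetical host-side use of the interface declared above; it assumes an existing CLCompileContext and initialized tensor infos, and for quantized types the destination info must already carry the quantization returned by get_softmax_output_quantization_info(), as validate() enforces.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"

using namespace arm_compute;

void configure_softmax_kernel(const CLCompileContext &ctx, const ITensorInfo &src, ITensorInfo &dst)
{
    SoftmaxKernelInfo info{};
    info.beta            = 1.0f;
    info.is_log          = false;
    info.input_data_type = src.data_type();
    info.axis            = 0; // 0 selects softmax_x; any other axis selects softmax_non_x

    ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClSoftmaxKernel::validate(src, dst, info));

    opencl::kernels::ClSoftmaxKernel kernel;
    kernel.configure(ctx, src, dst, info);

    // Workspace the caller must provide at run time; empty only on the float x-axis path.
    const TensorInfo &tmp = kernel.tmp_tensor_info();
    (void)tmp;
}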
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
index 2bec400597..427f6b4f92 100644
--- a/src/gpu/cl/operators/ClSoftmax.cpp
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,15 +23,14 @@
*/
#include "src/gpu/cl/operators/ClSoftmax.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/SoftmaxHelpers.h"
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
-#include "src/gpu/cl/operators/ClPermute.h"
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-#include "support/Cast.h"
using namespace arm_compute::experimental;
@@ -39,17 +38,8 @@ namespace arm_compute
{
namespace opencl
{
-ClSoftmax::ClSoftmax()
- : _permute_input(std::make_unique<ClPermute>()),
- _permute_output(std::make_unique<ClPermute>()),
- _max_shift_exp_sum_kernel(std::make_unique<kernels::ClLogits1DMaxShiftExpSumKernel>()),
- _norm_kernel(std::make_unique<kernels::ClLogits1DNormKernel>()),
- _max_info(),
- _sum_info(),
- _tmp_info(),
- _permuted_src_info(),
- _permuted_dst_info(),
- _aux_mem(InternalTensorIdx::COUNT)
+
+ClSoftmax::ClSoftmax() : _aux_mem(InternalTensorIdx::COUNT)
{
}
@@ -58,152 +48,37 @@ void ClSoftmax::configure(const CLCompileContext &compile_context,
ITensorInfo &dst,
const SoftmaxKernelInfo &info)
{
- ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, dst, info);
- const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
-
- _needs_permute = actual_axis != 0;
-
- const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src;
- ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst;
-
- if (_needs_permute)
- {
- const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info);
- }
-
- DataType tmp_data_type =
- is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
- _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type);
-
- TensorShape max_sum_shape = tmp_input_info.tensor_shape();
- _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape);
- _sum_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type);
+ auto k = std::make_unique<kernels::ClSoftmaxKernel>();
+ k->configure(compile_context, src, dst, info);
- // Set GPU target to kernels
- _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target());
+ _tmp_info = k->tmp_tensor_info();
- _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info);
- _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info);
+ _kernel = std::move(k);
- if (_needs_permute)
- {
- const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info);
- }
-
- _aux_mem[InternalTensorIdx::SUM] =
- MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
_aux_mem[InternalTensorIdx::TMP] =
MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
- _aux_mem[InternalTensorIdx::MAX] =
- MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
-
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
- MemoryLifetime::Temporary, _permuted_src_info.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST),
- MemoryLifetime::Temporary, _permuted_dst_info.total_size());
}
Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported");
- ARM_COMPUTE_UNUSED(info.beta);
- ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
- static_cast<int32_t>(src.num_dimensions()) <= info.axis);
-
- const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
- const bool needs_permute = actual_axis != 0;
- if (needs_permute)
- {
- const PermutationVector permutation_vector =
- softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape =
- misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
- TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector));
- TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector));
- }
-
- // Create intermediate tensor info
- DataType tmp_data_type = is_data_type_quantized_asymmetric(src.data_type()) ? DataType::S32 : src.data_type();
- TensorInfo tensor_info_tmp(src.clone()->set_data_type(tmp_data_type).set_is_resizable(true));
-
- TensorShape max_sum_shape = src.tensor_shape();
- max_sum_shape.set(0, 1);
- TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
- TensorInfo tensor_info_sum(src.clone()
- ->set_tensor_shape(max_sum_shape)
- .set_data_type(tmp_data_type)
- .set_quantization_info(QuantizationInfo())
- .set_is_resizable(true));
-
- ARM_COMPUTE_RETURN_ON_ERROR(
- kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info));
-
- return Status{};
+ return kernels::ClSoftmaxKernel::validate(src, dst, info);
}
void ClSoftmax::run(ITensorPack &tensors)
{
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- CLAuxTensorHandler sum(offset_int_vec(InternalTensorIdx::SUM), _sum_info, tensors, false);
- CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false);
- CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false);
-
- CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors,
- false);
- CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors,
- false);
-
- if (_needs_permute)
- {
- ITensorPack pack;
- pack.add_const_tensor(TensorType::ACL_SRC, src);
- pack.add_tensor(TensorType::ACL_DST, permuted_src.get());
- _permute_input.get()->run(pack);
- }
-
- ITensorPack sum_pack;
- ITensorPack norm_pack;
- if (_needs_permute)
- {
- sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get());
- norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get());
- }
- else
- {
- sum_pack.add_const_tensor(TensorType::ACL_SRC, src);
- norm_pack.add_tensor(TensorType::ACL_DST, dst);
- }
- sum_pack.add_tensor(TensorType::ACL_DST, tmp.get());
- sum_pack.add_tensor(TensorType::ACL_INT_0, max.get());
- sum_pack.add_tensor(TensorType::ACL_INT_1, sum.get());
-
- norm_pack.add_const_tensor(TensorType::ACL_SRC, tmp.get());
- norm_pack.add_tensor(TensorType::ACL_INT_0, sum.get());
-
- CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false);
- CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false);
-
- if (_needs_permute)
- {
- ITensorPack pack;
- pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get());
- pack.add_tensor(TensorType::ACL_DST, dst);
- _permute_output.get()->run(pack);
- }
+ CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors);
+
+ tensors.add_tensor(TensorType::ACL_INT_0, tmp.get());
+
+ CLScheduler::get().enqueue_op(*_kernel, tensors, false);
}
experimental::MemoryRequirements ClSoftmax::workspace() const
{
return _aux_mem;
}
+
} // namespace opencl
} // namespace arm_compute
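Running the simplified operator needs only the user-visible tensors; a minimal calling sketch (hypothetical code, assuming a configured ClSoftmax and allocated CL tensors):

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/gpu/cl/operators/ClSoftmax.h"

using namespace arm_compute;

void run_softmax(opencl::ClSoftmax &softmax, const ITensor *src, ITensor *dst)
{
    // Only src/dst go into the pack: ClSoftmax::run() wraps its TMP workspace in a
    // CLAuxTensorHandler and adds it as ACL_INT_0 itself, as shown above.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, src);
    pack.add_tensor(TensorType::ACL_DST, dst);
    softmax.run(pack); // one kernel enqueue; no permute round-trip any more
}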
diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h
index 6c2aaaea80..232fcfebd1 100644
--- a/src/gpu/cl/operators/ClSoftmax.h
+++ b/src/gpu/cl/operators/ClSoftmax.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,25 +21,26 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_SOFTMAX_H
-#define ARM_COMPUTE_CL_SOFTMAX_H
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
namespace arm_compute
{
+class CLCompileContext;
+class ITensorInfo;
+class ITensorPack;
struct SoftmaxKernelInfo;
namespace opencl
{
-class ClPermute;
namespace kernels
{
-class ClLogits1DMaxShiftExpSumKernel;
-class ClLogits1DNormKernel;
+class ClSoftmaxKernel;
} // namespace kernels
class ClSoftmax : public IClOperator
{
@@ -64,36 +65,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+
+ void run(ITensorPack &tensors) override;
+
experimental::MemoryRequirements workspace() const override;
private:
enum InternalTensorIdx
{
- MAX = 0,
- SUM,
- TMP,
- PERMUTED_SRC,
- PERMUTED_DST,
- COUNT
+ TMP = 0,
+ COUNT,
};
- std::unique_ptr<ClPermute> _permute_input;
- std::unique_ptr<ClPermute> _permute_output;
- std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel;
- std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel;
- bool _needs_permute{false};
-
- TensorInfo _max_info;
- TensorInfo _sum_info;
- TensorInfo _tmp_info;
- TensorInfo _permuted_src_info;
- TensorInfo _permuted_dst_info;
-
- experimental::MemoryRequirements _aux_mem{};
+ TensorInfo _tmp_info{};
+ experimental::MemoryRequirements _aux_mem;
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
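Nothing changes at the public API level; an end-to-end sketch of the path this patch optimizes (standard ACL boilerplate, with arbitrary shape and data type for illustration):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

int main()
{
    using namespace arm_compute;
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));

    CLSoftmaxLayer softmax;
    softmax.configure(&src, &dst, 1.0f /* beta */, 0 /* axis: reduce along x */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map src, fill the logits, unmap ...

    softmax.run(); // dispatches the single softmax_x / softmax_non_x kernel
    CLScheduler::get().sync();
    return 0;
}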