author     SiCong Li <sicong.li@arm.com>  2020-08-21 12:28:30 +0100
committer  SiCong Li <sicong.li@arm.com>  2020-08-25 14:12:07 +0000
commit     96209c73b071bb65d4919fb441076f977095a31b (patch)
tree       50252f1a33992b3a6171c6b2becf6da1b6f0022d
parent     5111264954e2d1a4d3e91d23a0869a0d7105be4c (diff)
download   ComputeLibrary-96209c73b071bb65d4919fb441076f977095a31b.tar.gz
COMPMID-3694 COMPMID-3695 COMPMID-3458: Softmax Axis
* Properly support "axis" in CL and NEON (and GC) SoftmaxLayer and LogSoftmaxLayer
  in accord with mainstream frameworks. Axis now defines the dimension on which
  softmax is performed, and supports the range [-rank, rank)
* Extend validation tests to include valid and invalid axes
* Remove unnecessary LogSoftmaxLayer fixture, as it is only a specialisation of
  the SoftmaxLayer fixture
* Change the validation fill value range from [-1000, 1000] to [-10, 10], as the
  former often results in sparse outputs with a single one and zeros elsewhere

Change-Id: I8a0040453182b04ed88260de3ba434e98258d863
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3830
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--  arm_compute/core/Helpers.h  25
-rw-r--r--  arm_compute/core/Helpers.inl  5
-rw-r--r--  arm_compute/runtime/CL/functions/CLSoftmaxLayer.h  57
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h  16
-rw-r--r--  arm_compute/runtime/NEON/functions/NESoftmaxLayer.h  36
-rw-r--r--  docs/00_introduction.dox  12
-rw-r--r--  src/core/CL/kernels/CLPermuteKernel.cpp  8
-rw-r--r--  src/core/Helpers.cpp  34
-rw-r--r--  src/runtime/CL/functions/CLSoftmaxLayer.cpp  172
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp  6
-rw-r--r--  src/runtime/NEON/functions/NESoftmaxLayer.cpp  124
-rw-r--r--  tests/validation/CL/LogSoftmaxLayer.cpp  14
-rw-r--r--  tests/validation/CL/SoftmaxLayer.cpp  24
-rw-r--r--  tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp  18
-rw-r--r--  tests/validation/NEON/LogSoftmaxLayer.cpp  12
-rw-r--r--  tests/validation/NEON/SoftmaxLayer.cpp  24
-rw-r--r--  tests/validation/fixtures/SoftmaxLayerFixture.h  15
-rw-r--r--  tests/validation/reference/LogSoftmaxLayer.cpp  61
-rw-r--r--  tests/validation/reference/LogSoftmaxLayer.h  47
-rw-r--r--  tests/validation/reference/SoftmaxLayer.cpp  89
-rw-r--r--  tests/validation/reference/SoftmaxLayer.h  6
21 files changed, 286 insertions(+), 519 deletions(-)
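
The interface change described in the commit message above can be exercised as in the following minimal sketch. It assumes a NEON build of the library; the tensor shape, data type and axis value are illustrative only and are not taken from this patch.

// Minimal sketch: softmax over a chosen dimension with the updated interface.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Rank-3 input of shape 4x5x6 (illustrative).
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(4U, 5U, 6U), 1, DataType::F32));

    // axis = 1 applies softmax along the dimension of size 5, i.e. to 4x6 = 24
    // vectors of 5 elements each; axis = -2 is the equivalent negative index.
    NESoftmaxLayer softmax;
    softmax.configure(&input, &output, /* beta */ 1.0f, /* axis */ 1);

    input.allocator()->allocate();
    output.allocator()->allocate();
    // ... fill the input buffer, then:
    softmax.run();
    return 0;
}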
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 48ac38b170..90dd6082e1 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -801,16 +801,6 @@ inline T wrap_around(T x, T m)
return x >= 0 ? x % m : (x % m + m) % m;
}
-/** Convert a dimension axis to the number of dimensions in the range [0, @p dim_axis]
- * Handle negative axis, negative axis is used to specify axis from the end (e.g. -1 for the last axis).
- *
- * @param[in] dim_axis The last axis (inclusive) in the range [0, @p dim_axis]
- * @param[in] num_dims The total number of dimensions
- *
- * @return The number of dimensions in the range [0, @p dim_axis]
- */
-inline size_t dim_index_2_num_dims(int32_t dim_axis, int32_t num_dims);
-
/** Convert negative coordinates to positive in the range [0, num_dims_input]
*
* @param[out] coords Array of coordinates to be converted.
@@ -852,6 +842,21 @@ inline unsigned int get_next_power_two(unsigned int x)
return x;
}
+
+/** Given a softmax axis, this function returns the permutation vector required to put the axis to the front
+ *
+ * @note This function assumes a tensor rank <= 4
+ *
+ * Axis selects the dimension on which softmax is performed.
+ * E.g. for input of shape 4x5x6 and axis=1, softmax will be applied to 4x6=24 vectors of size 5.
+ * Internally, the softmax kernels always operate on the first (front) dimension, therefore a permutation is
+ * required to put the dimension specified by @p axis to the first dimension.
+ *
+ * @param[in] axis Axis on which to perform softmax. Supported: 1, 2, 3 (0 implies no permutation needed)
+ *
+ * @return the permutation vector
+ */
+PermutationVector get_permutation_vector_from_softmax_axis(size_t axis);
} // namespace arm_compute
#include "arm_compute/core/Helpers.inl"
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index df0c929372..5613e8c74e 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -29,11 +29,6 @@
namespace arm_compute
{
-inline size_t dim_index_2_num_dims(int32_t dim_axis, int32_t num_dims)
-{
- return static_cast<size_t>(wrap_around(dim_axis, num_dims)) + 1;
-}
-
inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
{
ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index bb01584ff4..fd71f3ed4d 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -26,8 +26,7 @@
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
-#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -47,7 +46,10 @@ class ICLTensor;
* @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
*
* This function runs the following kernels:
+ * -# If axis is not 0:
+ * -# @ref CLPermute
* -# @ref CLLogits1DNormKernel
+ * -# @ref CLLogits1DMaxShiftExpSumKernel
*/
template <bool IS_LOG = false>
class CLSoftmaxLayerGeneric : public IFunction
@@ -60,70 +62,47 @@ public:
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
* @param[out] output Destination tensor. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
- void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 0);
+ void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
* @param[out] output Destination tensor. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 0);
+ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
*
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
* @param[in] output Destination tensor. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, size_t axis = 0);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, int32_t axis = 0);
// Inherited methods overridden:
void run() override;
private:
- /** Utility method to configure the kernels needed to flatten the input
- * tensor.
- *
- * @note This function changes the internal state of this class. In particular,
- * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
- * @p _output_flat
- *
- * @param[in] input Original source tensor.
- * @param[in] output Original destination tensor.
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
- */
- void configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis);
- /** Utility method to configure the kernels needed to flatten the input
- * tensor.
- *
- * @note This function changes the internal state of this class. In particular,
- * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
- * @p _output_flat
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Original source tensor.
- * @param[in] output Original destination tensor.
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
- */
- void configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis);
-
MemoryGroup _memory_group;
+ CLPermute _permute_input;
+ CLPermute _permute_output;
CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
CLLogits1DNormKernel _norm_kernel;
- std::unique_ptr<IFunction> _flatten_ptr;
- CLReshapeLayer _reshape;
CLTensor _max;
CLTensor _sum;
CLTensor _tmp;
- CLTensor _input_flattened;
- CLTensor _output_flattened;
- bool _needs_flattening;
+ CLTensor _input_permuted;
+ CLTensor _output_permuted;
+ bool _needs_permute;
};
using CLSoftmaxLayer = CLSoftmaxLayerGeneric<false>;
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
index 4ccfe2684e..0279edf63d 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
@@ -50,17 +50,15 @@ public:
GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[in] beta (Optional) A scaling factor for the exponent. Only beta = 1 is supported
- * @param[in] reduce_end_axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Defaults to 0.
- * It has the purpose of squashing together the first n dimensions till (including) the @p reduce_end_axis. For instance, given a [2x3x4x5] image,
- * when @p reduce_end_axis is 1, the reduction will be applied to axes 0 and 1, and the Softmax op will be applied on each of the [2x3] planes of the input image.
- * Must be in range [0, input_num_dimensions).
+ * @param[in] input Source tensor. Data types supported: F16/F32
+ * @param[out] output Destination tensor. Data types supported: same as @p input
+ * @param[in] beta (Optional) A scaling factor for the exponent. Only beta = 1 is supported
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*
- * @note The value of @p reduce_end_axis must be always 0 for GLES
+ * @note The value of @p axis must always be 0 for GLES
*/
- void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, size_t reduce_end_axis = 0);
+ void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, int32_t axis = 0);
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index 9fb4d85262..20b20201d5 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -28,8 +28,7 @@
#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
@@ -44,7 +43,9 @@ class ITensor;
* Log Softmax is calculated by :
* @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
*
- * This function runs the following kernels:
+ * This function runs the following function/kernels:
+ * -# If axis is not 0:
+ * -# @ref NEPermute
* -# @ref NEFillBorderKernel
* -# @ref NELogits1DMaxKernel
* -# @ref NELogits1DSoftmaxKernel
@@ -70,7 +71,8 @@ public:
* last value of each row to the nearest multiple.
* @param[out] output Destination tensor. Data types supported: same as @p input.
* @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
void configure(ITensor *input, ITensor *output, float beta = 1.0f, int32_t axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer
@@ -78,7 +80,8 @@ public:
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] output Destination tensor info. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*
* @return a status
*/
@@ -88,30 +91,17 @@ public:
void run() override;
private:
- /** Utility method to configure the kernels needed to flatten the input
- * tensor.
- *
- * @note This function changes the internal state of this class. In particular,
- * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
- * @p _output_flat
- *
- * @param[in] input Original source tensor.
- * @param[in] output Original destination tensor.
- * @param[in] axis (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
- */
- void configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t axis);
-
MemoryGroup _memory_group;
+ NEPermute _permute_input;
+ NEPermute _permute_output;
NELogits1DMaxKernel _max_kernel;
NELogits1DSoftmaxKernel<IS_LOG> _softmax_kernel;
- std::unique_ptr<IFunction> _flat_or_reshape_ptr;
NEFillBorderKernel _fill_border_kernel;
- NEReshapeLayer _reshape;
Tensor _max;
Tensor _tmp;
- Tensor _input_flattened;
- Tensor _output_flattened;
- bool _needs_flattening;
+ Tensor _input_permuted;
+ Tensor _output_permuted;
+ bool _needs_permute;
};
using NESoftmaxLayer = NESoftmaxLayerGeneric<false>;
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index bb1dfec69e..37c39f50a4 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -237,6 +237,18 @@ If there is more than one release in a month then an extra sequential number is
@subsection S2_2_changelog Changelog
+v20.11 Public major release
+ - Interface change
+ - Properly support softmax axis to have the same meaning as other major frameworks. That is, axis now defines the dimension
+ on which Softmax/Logsoftmax is performed. E.g. for input of shape 4x5x6 and axis=1, softmax will be applied to 4x6=24 vectors of size 5.
+ The supported value range of axis is [-rank, rank).
+ This change applies to the following functions:
+ - @ref NESoftmaxLayer
+ - @ref NELogSoftmaxLayer
+ - @ref CLSoftmaxLayer
+ - @ref CLLogSoftmaxLayer
+ - @ref GCSoftmaxLayer
+
v20.08 Public major release
- Various bug fixes.
- Various optimisations.
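
The changelog entry above states that the supported axis range is [-rank, rank); this can be probed up front with the static validate() methods, mirroring the updated validation tests. A minimal sketch (NEON variant, illustrative rank-2 shape; valid axes for rank 2 are -2, -1, 0 and 1):

// Sketch: probing the accepted axis range with NESoftmaxLayer::validate().
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    const TensorInfo info(TensorShape(32U, 13U), 1, DataType::F32); // rank 2

    const int32_t axes[] = { 0, 1, -1, -2, 2, -3 };
    for(int32_t axis : axes)
    {
        const Status status = NESoftmaxLayer::validate(&info, &info, 1.0f, axis);
        std::cout << "axis " << axis << (bool(status) ? ": valid" : ": rejected") << std::endl;
    }
    return 0;
}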
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index 1636e5a1bc..dc2d6fe4b4 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -75,16 +75,16 @@ void CLPermuteKernel::configure(const ICLTensor *input, ICLTensor *output, const
void CLPermuteKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ const TensorShape output_shape = get_output_shape(input->info(), perm);
+ // Output auto inizialitation if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
_input = input;
_output = output;
_perm = perm;
- const TensorShape output_shape = get_output_shape(input->info(), perm);
- // Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index bfc4a8d101..5c7200b35c 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,9 +23,9 @@
*/
#include "arm_compute/core/Helpers.h"
-using namespace arm_compute;
-
-Window arm_compute::calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+namespace arm_compute
+{
+Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
{
if(!skip_border)
{
@@ -79,7 +79,7 @@ Window arm_compute::calculate_max_window(const ValidRegion &valid_region, const
return window;
}
-Window arm_compute::calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size)
+Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size)
{
const Coordinates &anchor = valid_region.anchor;
const TensorShape &shape = valid_region.shape;
@@ -128,7 +128,7 @@ Window arm_compute::calculate_max_enlarged_window(const ValidRegion &valid_regio
return window;
}
-Window arm_compute::calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
{
if(skip_border)
{
@@ -181,8 +181,8 @@ Window arm_compute::calculate_max_window_horizontal(const ValidRegion &valid_reg
return window;
}
-ValidRegion arm_compute::calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
- InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
+ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
+ InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
{
const DataLayout data_layout = src_info.data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -255,4 +255,20 @@ ValidRegion arm_compute::calculate_valid_region_scale(const ITensorInfo &src_inf
valid_region.shape.set(idx_height, std::min<size_t>(valid_end_out_y - valid_start_out_y, dst_shape[idx_height]));
return valid_region;
-}
\ No newline at end of file
+}
+
+PermutationVector get_permutation_vector_from_softmax_axis(size_t actual_axis)
+{
+ switch(actual_axis)
+ {
+ case 1:
+ return PermutationVector(1U, 0U, 2U, 3U);
+ case 2:
+ return PermutationVector(2U, 1U, 0U, 3U);
+ case 3:
+ return PermutationVector(3U, 1U, 2U, 0U);
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+}
+} // namespace arm_compute
\ No newline at end of file
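
The get_permutation_vector_from_softmax_axis() helper added above only has to bring the requested dimension to the front, since the softmax kernels always reduce dimension 0. A small sketch of how a (possibly negative) axis is resolved with wrap_around() and turned into a permutation (rank and axis values are illustrative):

// Sketch: resolving a negative softmax axis and building the front-permutation.
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"

#include <cstdint>
#include <iostream>

using namespace arm_compute;

int main()
{
    const int32_t rank = 3;   // e.g. a 4x5x6 tensor
    const int32_t axis = -2;  // negative indices count from the end, range [-rank, rank)

    // wrap_around(-2, 3) resolves to 1.
    const size_t actual_axis = static_cast<size_t>(wrap_around(axis, rank));

    // For actual_axis == 1 this returns PermutationVector(1U, 0U, 2U, 3U),
    // i.e. swap dimensions 0 and 1 so the softmax dimension becomes dimension 0.
    const PermutationVector perm = get_permutation_vector_from_softmax_axis(actual_axis);

    std::cout << "actual_axis = " << actual_axis << ", perm = [";
    for(size_t i = 0; i < perm.num_dimensions(); ++i)
    {
        std::cout << perm[i] << (i + 1 < perm.num_dimensions() ? ", " : "]\n");
    }
    return 0;
}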
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index f7b2935622..720f9111a5 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -36,96 +36,45 @@ namespace arm_compute
{
template <bool IS_LOG>
CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_ptr(), _reshape(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
- _needs_flattening(false)
+ : _memory_group(std::move(memory_manager)), _permute_input(), _permute_output(), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp(), _input_permuted(), _output_permuted(),
+ _needs_permute()
{
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t first_n_reduce_axes)
-{
- configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, first_n_reduce_axes);
-}
-
-template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t first_n_reduce_axes)
-{
- // Flatten the input
- const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), first_n_reduce_axes);
-
- // Initialize the flat input
- _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
-
- // If we need to flatten the input, we can use CLFlattenKernel or CLReshapeKernel
- // If the number of reduced axes is 3 (max dimension), which means collapsing all axes except the batch axis, we use CLFlattenKernel.
- // In all other cases we have to use CLReshapeKernel
- // Note that the "other cases" include both:
- // 1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive)
- // 2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by CLReshapeKernel instead of CLFlattenKernel.
- if(first_n_reduce_axes == 3)
- {
- auto flatten = support::cpp14::make_unique<CLFlattenLayer>();
- flatten->configure(compile_context, input, &_input_flattened);
- _flatten_ptr = std::move(flatten);
- }
- else
- {
- auto reshape_ptr = support::cpp14::make_unique<CLReshapeLayer>();
- reshape_ptr->configure(compile_context, input, &_input_flattened);
- _flatten_ptr = std::move(reshape_ptr);
- }
-
- // We need to init the output tensor here. Indeed, the reshape kernel expects
- // both tensors to be already initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
-}
-
-template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
{
configure(CLKernelLibrary::get().get_compile_context(), input, output, beta, axis);
}
template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis));
- // Convert reduce-before axis (inclusive) to first n axes to reduce
- size_t first_n_reduce_axes = dim_index_2_num_dims(axis, input->info()->num_dimensions());
-
- // We only need flattening when the number of axes to reduce is greater than 1
- _needs_flattening = first_n_reduce_axes > 1;
+ const size_t actual_axis = static_cast<size_t>(wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions())));
- // If we are dealing with a 4D tensor, we will:
- // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
- // - Execute all the pipeline (reduction + normalization) on the flattened tensor
- // - Reshape the flattened output into the real output
- if(_needs_flattening)
+ _needs_permute = actual_axis != 0;
+ ICLTensor *tmp_output = output;
+ const ICLTensor *tmp_input = _needs_permute ? &_input_permuted : input;
+ if(_needs_permute)
{
- // Add to the memory manager _input_flattened
- _memory_group.manage(&_input_flattened);
-
- // Cofigure _flatten_kernel and _input_flattened
- configure_reshape_input_kernel(input, output, first_n_reduce_axes);
+ _memory_group.manage(&_input_permuted);
+ _memory_group.manage(&_output_permuted);
+ _permute_input.configure(compile_context, input, &_input_permuted, get_permutation_vector_from_softmax_axis(actual_axis));
+ tmp_output = &_output_permuted;
}
- // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
- // or it is the original input case (2D case)
- const ICLTensor *input_2D = (_needs_flattening ? &_input_flattened : input);
-
- // Create intermediate tensors shapes
- TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::S32 : input_2D->info()->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+ // Create intermediate tensors
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->info()->data_type()) ? DataType::S32 : tmp_input->info()->data_type();
+ TensorInfo tensor_info_tmp(tmp_input->info()->clone()->set_data_type(tmp_data_type));
_tmp.allocator()->init(tensor_info_tmp);
-
- TensorShape max_sum_shape = input_2D->info()->tensor_shape();
+ TensorShape max_sum_shape = tmp_input->info()->tensor_shape();
max_sum_shape.set(0, 1);
- _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
- _sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type));
+ _max.allocator()->init(tmp_input->info()->clone()->set_tensor_shape(max_sum_shape));
+ _sum.allocator()->init(tmp_input->info()->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type));
// Set GPU target to kernels
_max_shift_exp_sum_kernel.set_target(CLScheduler::get().target());
@@ -138,49 +87,43 @@ void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_co
SoftmaxKernelInfo softmax_info;
softmax_info.beta = beta;
softmax_info.is_log = IS_LOG;
- softmax_info.input_data_type = input_2D->info()->data_type();
+ softmax_info.input_data_type = tmp_input->info()->data_type();
// Configure kernels
- _max_shift_exp_sum_kernel.configure(compile_context, input_2D, &_max, &_tmp, &_sum, softmax_info);
-
- if(_needs_flattening)
- {
- // Add to the memory manager _output_flattened
- _memory_group.manage(&_output_flattened);
-
- // The normalization kernel stores the result in a flat output tensor
- _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info);
-
- // Reshape the flat output into a the requested (4D) output
- _reshape.configure(compile_context, &_output_flattened, output);
-
- // Allocate the intermediate flat tensors
- _input_flattened.allocator()->allocate();
- _output_flattened.allocator()->allocate();
- }
- else
- {
- // Softmax 2D case
- _norm_kernel.configure(compile_context, &_tmp, &_sum, output, softmax_info);
- }
+ _max_shift_exp_sum_kernel.configure(compile_context, tmp_input, &_max, &_tmp, &_sum, softmax_info);
+ _norm_kernel.configure(compile_context, &_tmp, &_sum, tmp_output, softmax_info);
// Allocate intermediate buffers
_tmp.allocator()->allocate();
_max.allocator()->allocate();
_sum.allocator()->allocate();
+ if(_needs_permute)
+ {
+ _permute_output.configure(compile_context, &_output_permuted, output, get_permutation_vector_from_softmax_axis(actual_axis));
+ _input_permuted.allocator()->allocate();
+ _output_permuted.allocator()->allocate();
+ }
}
template <bool IS_LOG>
-Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
+Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 0, "Only axis 0 supported in tensors");
ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() <= axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-input->num_dimensions()) || static_cast<int32_t>(input->num_dimensions()) <= axis);
- // Convert reduce-before axis (inclusive) to first n axes to reduce
- size_t first_n_reduce_axes = dim_index_2_num_dims(axis, input->num_dimensions());
+ const size_t actual_axis = static_cast<size_t>(wrap_around(axis, static_cast<int32_t>(input->num_dimensions())));
+ const bool needs_permute = actual_axis != 0;
+ if(needs_permute)
+ {
+ const PermutationVector permutation_vector = get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
+ TensorInfo input_permuted(input->clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &input_permuted, permutation_vector));
+ TensorInfo output_permuted(output->clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&output_permuted, output, permutation_vector));
+ }
// Create intermediate tensor info
DataType tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
@@ -191,23 +134,6 @@ Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
- const bool needs_flattening = (first_n_reduce_axes > 1);
-
- if(needs_flattening)
- {
- const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, first_n_reduce_axes);
- TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
-
- if(first_n_reduce_axes == 3)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &tensor_info_flat));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(input, &tensor_info_flat));
- }
- }
-
SoftmaxKernelInfo softmax_info;
softmax_info.beta = beta;
softmax_info.is_log = IS_LOG;
@@ -216,12 +142,6 @@ Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output, softmax_info));
- if(needs_flattening)
- {
- const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input);
- TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
- }
-
return Status{};
}
@@ -230,17 +150,17 @@ void CLSoftmaxLayerGeneric<IS_LOG>::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_needs_flattening)
+ if(_needs_permute)
{
- _flatten_ptr->run();
+ _permute_input.run();
}
CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
- CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening);
+ CLScheduler::get().enqueue(_norm_kernel, !_needs_permute);
- if(_needs_flattening)
+ if(_needs_permute)
{
- _reshape.run();
+ _permute_output.run();
}
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 48d8cb576b..fdb9a42f13 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -34,13 +34,13 @@ GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
{
}
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t reduce_end_axis)
+void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, int32_t axis)
{
- ARM_COMPUTE_UNUSED(beta, reduce_end_axis);
+ ARM_COMPUTE_UNUSED(beta, axis);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(beta != 1.0f);
- ARM_COMPUTE_ERROR_ON_MSG(reduce_end_axis != 0, "Reduce_end_axis must be 0 for GLES");
+ ARM_COMPUTE_ERROR_ON_MSG(axis != 0, "axis must be 0 for GLES");
// Create intermediate tensors shapes
_tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 750992fca6..e763caa3a3 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,78 +32,41 @@ namespace arm_compute
{
template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_ptr(nullptr), _fill_border_kernel(), _reshape(), _max(), _tmp(), _input_flattened(), _output_flattened(),
- _needs_flattening(false)
+ : _memory_group(std::move(memory_manager)), _permute_input(), _permute_output(), _max_kernel(), _softmax_kernel(), _fill_border_kernel(), _max(), _tmp(), _input_permuted(), _output_permuted(),
+ _needs_permute(false)
{
}
template <bool IS_LOG>
-void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t first_n_reduce_axes)
-{
- // Flatten the input
- const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), first_n_reduce_axes);
-
- // Initialize the flat input
- _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
-
- // Note that the "other cases" include both:
- // 1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive)
- // 2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by NEReshapeKernel instead of NEFlattenKernel.
- if(first_n_reduce_axes == 3)
- {
- auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayer>();
- flatten_kernel_ptr->configure(input, &_input_flattened);
- _flat_or_reshape_ptr = std::move(flatten_kernel_ptr);
- }
- else
- {
- auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayer>();
- reshape_kernel_ptr->configure(input, &_input_flattened);
- _flat_or_reshape_ptr = std::move(reshape_kernel_ptr);
- }
-
- // We need to init the output tensor here. Indeed, the reshape kernel expects
- // both tensors to be already initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
-}
-
-template <bool IS_LOG>
void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, float beta, int32_t axis)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis));
- // Convert reduce-before axis (inclusive) to first n axes to reduce
- size_t first_n_reduce_axes = dim_index_2_num_dims(axis, static_cast<int32_t>(input->info()->num_dimensions()));
+ const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions())));
- // We only need flattening when the number of axes to reduce is greater than 1
- _needs_flattening = first_n_reduce_axes > 1;
+ _needs_permute = actual_axis > 0;
- // If we are dealing with a 4D tensor, we will:
- // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
- // - Execute all the pipeline (reduction + normalization) on the flattened tensor
- // - Reshape the flattened output into the real output
- if(_needs_flattening)
+ if(_needs_permute)
{
- // Add to the memory manager _input_flattened
- _memory_group.manage(&_input_flattened);
+ // Add to the memory manager _input_permuted
+ _memory_group.manage(&_input_permuted);
- // Configure _flatten_kernel and _input_flattened
- configure_reshape_input_kernel(input, output, first_n_reduce_axes);
+ _permute_input.configure(input, &_input_permuted, get_permutation_vector_from_softmax_axis(actual_axis));
}
- // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+ // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
// or it is the original input case (2D case)
- ITensor *input_2D = (_needs_flattening ? &_input_flattened : input);
+ ITensor *tmp_input = (_needs_permute ? &_input_permuted : input);
// Create intermediate tensors shapes
- const TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::F32 : input_2D->info()->data_type();
+ const TensorInfo input_info = tmp_input->info()->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->info()->data_type()) ? DataType::F32 : tmp_input->info()->data_type();
TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
// Init intermediate tensors
- TensorShape max_sum_shape = input_2D->info()->tensor_shape();
+ TensorShape max_sum_shape = tmp_input->info()->tensor_shape();
max_sum_shape.set(0, 1);
_max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
_tmp.allocator()->init(tensor_info_tmp);
@@ -113,27 +76,27 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
_memory_group.manage(&_tmp);
// Configure Kernels
- _max_kernel.configure(input_2D, &_max);
- if(_needs_flattening)
+ _max_kernel.configure(tmp_input, &_max);
+ if(_needs_permute)
{
- // Add to the memory manager _output_flattened
- _memory_group.manage(&_output_flattened);
+ // Add to the memory manager _output_permuted
+ _memory_group.manage(&_output_permuted);
- // The normalization kernel stores the result in a flat output tensor
- _softmax_kernel.configure(input_2D, &_max, &_output_flattened, beta, &_tmp);
- _input_flattened.allocator()->allocate();
+ // The normalization kernel stores the result in a permuted output tensor
+ _softmax_kernel.configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
+ _input_permuted.allocator()->allocate();
- // Reshape the flat output into the requested (4D) output
- _reshape.configure(&_output_flattened, output);
+ // Re-permute the permuted output into the requested (4D) output
+ _permute_output.configure(&_output_permuted, output, get_permutation_vector_from_softmax_axis(actual_axis));
- // Allocate the intermediate flat tensors
- _output_flattened.allocator()->allocate();
+ // Allocate the intermediate permuted tensors
+ _output_permuted.allocator()->allocate();
}
else
{
// Softmax 2D case
- _fill_border_kernel.configure(input_2D, _max_kernel.border_size(), BorderMode::REPLICATE);
- _softmax_kernel.configure(input_2D, &_max, output, beta, &_tmp);
+ _fill_border_kernel.configure(tmp_input, _max_kernel.border_size(), BorderMode::REPLICATE);
+ _softmax_kernel.configure(tmp_input, &_max, output, beta, &_tmp);
}
// Allocate intermediate buffers
@@ -148,12 +111,8 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 0, "Only axis 0 supported");
ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-input->num_dimensions()) || static_cast<int32_t>(input->num_dimensions()) <= axis);
- // Convert reduce-before axis (inclusive) to first n axes to reduce
- size_t first_n_reduce_axes = dim_index_2_num_dims(axis, static_cast<int32_t>(input->num_dimensions()));
-
// Create intermediate tensor info
DataType tmp_data_type = input->data_type();
const TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
@@ -163,21 +122,18 @@ Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const I
const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
const TensorInfo dont_care;
- const bool needs_flattening = (first_n_reduce_axes > 1);
+ const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(input->num_dimensions())));
+
+ const bool needs_permute = actual_axis > 0;
- if(needs_flattening)
+ if(needs_permute)
{
- const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, first_n_reduce_axes);
- TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
-
- if(first_n_reduce_axes == 3)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &tensor_info_flat));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(input, &tensor_info_flat));
- }
+ const PermutationVector permutation_vector = get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*input, permutation_vector);
+ TensorInfo input_permuted(input->clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(input, &input_permuted, permutation_vector));
+ TensorInfo output_permuted(output->clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(&output_permuted, output, permutation_vector));
}
ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
@@ -191,18 +147,18 @@ void NESoftmaxLayerGeneric<IS_LOG>::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_needs_flattening)
+ if(_needs_permute)
{
- _flat_or_reshape_ptr->run();
+ _permute_input.run();
}
NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
NEScheduler::get().schedule(&_max_kernel, Window::DimY);
NEScheduler::get().schedule(&_softmax_kernel, Window::DimY);
- if(_needs_flattening)
+ if(_needs_permute)
{
- _reshape.run();
+ _permute_output.run();
}
}
diff --git a/tests/validation/CL/LogSoftmaxLayer.cpp b/tests/validation/CL/LogSoftmaxLayer.cpp
index 15466affc4..8fdc745d13 100644
--- a/tests/validation/CL/LogSoftmaxLayer.cpp
+++ b/tests/validation/CL/LogSoftmaxLayer.cpp
@@ -59,7 +59,7 @@ TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -75,7 +75,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerFixture<half>, framework::Data
FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -3, 2 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -86,7 +86,7 @@ TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -99,10 +99,10 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerFixture<float>, framework::Dat
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("Beta", { 1.0f, 2.0f })),
+ framework::dataset::make("Axis", { 0, -4, 3 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 90c3058c5d..0e4952b02a 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -129,9 +129,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
QuantizationInfo(1.f/256, 0)),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED,
QuantizationInfo(1.f/256, -128)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis high
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED,
QuantizationInfo(1.f/256, -128)),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis low
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED,
QuantizationInfo(1.f/256, -128)),
})),
framework::dataset::make("beta", { 1.0,
@@ -151,11 +151,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
0,
0,
0,
+ 1,
0,
- 0,
- 0,
- 2,
-1,
+ 2,
+ -3,
})),
framework::dataset::make("Expected", { false, false, false, false, false, true, true, true, false, false })),
input_info, output_info, beta, axis, expected)
@@ -173,7 +173,7 @@ TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -189,7 +189,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half>, framework::Dataset
FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1, 2 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -200,7 +200,7 @@ TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -216,7 +216,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<float>, framework::Datase
FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -2, 3 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -233,7 +233,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerQuantizedFixture<uint8_t>, framew
framework::dataset::make("DataType", DataType::QASYMM8)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -251,7 +251,7 @@ FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework
framework::dataset::make("DataType", DataType::QASYMM8)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.0f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -4, 1 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -265,7 +265,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerQuantizedFixture<int8_t>, framewo
framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1 })))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
diff --git a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
index af92cff813..2a8a33eaf7 100644
--- a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
@@ -89,7 +89,7 @@ TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", 1.0f)),
- framework::dataset::make("ReduceEndAxis", 0)))
+ framework::dataset::make("Axis", 0)))
{
// Validate output
validate(GCAccessor(_target), _reference, tolerance_f16);
@@ -97,18 +97,18 @@ FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<half_float::half>, framew
FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", 1.0f)),
- framework::dataset::make("ReduceEndAxis", 0)))
+ framework::dataset::make("Axis", 0)))
{
// Validate output
validate(GCAccessor(_target), _reference, tolerance_f16);
}
-TEST_SUITE_END()
+TEST_SUITE_END() // FP16
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", 1.0f)),
- framework::dataset::make("ReduceEndAxis", 0)))
+ framework::dataset::make("Axis", 0)))
{
// Validate output
validate(GCAccessor(_target), _reference, tolerance_f32);
@@ -116,16 +116,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<float>, framework::Datase
FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", 1.0f)),
- framework::dataset::make("ReduceEndAxis", 0)))
+ framework::dataset::make("Axis", 0)))
{
// Validate output
validate(GCAccessor(_target), _reference, tolerance_f32);
}
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // SoftmaxLayer
+TEST_SUITE_END() // GC
} // namespace validation
} // namespace test
} // namespace arm_compute
diff --git a/tests/validation/NEON/LogSoftmaxLayer.cpp b/tests/validation/NEON/LogSoftmaxLayer.cpp
index 3f85e3f7a2..a7ab033359 100644
--- a/tests/validation/NEON/LogSoftmaxLayer.cpp
+++ b/tests/validation/NEON/LogSoftmaxLayer.cpp
@@ -71,7 +71,7 @@ TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
@@ -79,7 +79,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NELogSoftmaxLayerFixture<half>, framework::Data
FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -3, 2 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
@@ -99,7 +99,7 @@ TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
@@ -107,7 +107,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture<float>, framework::D
FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 2, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
@@ -132,7 +132,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerQuantizedFixture<uint8_t>, f
framework::dataset::make("DataType", DataType::QASYMM8)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -141,7 +141,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerQuantizedFixture<uint8_t>, f
framework::dataset::make("DataType", DataType::QASYMM8)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1, 1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
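
The separate LogSoftmaxLayer reference is removed later in this patch because log-softmax differs from softmax only in the final normalisation step. A rough standalone sketch of that shared structure on a 1-D vector (plain C++, not the NEON kernels or the test reference):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Shared reduction: only the last step differs between softmax and log-softmax.
static std::vector<float> softmax_1d(const std::vector<float> &x, float beta, bool is_log)
{
    const float max_val = *std::max_element(x.begin(), x.end());
    std::vector<float> y(x.size());
    float sum = 0.f;
    for(size_t i = 0; i < x.size(); ++i)
    {
        const float shifted = (x[i] - max_val) * beta;
        y[i] = is_log ? shifted : std::exp(shifted); // keep the raw exponent for log-softmax
        sum += std::exp(shifted);
    }
    for(float &v : y)
    {
        v = is_log ? v - std::log(sum) : v / sum;
    }
    return y;
}

int main()
{
    const std::vector<float> x{ 1.f, 2.f, 3.f };
    for(float v : softmax_1d(x, 1.f, false)) std::printf("%f ", v); // softmax
    std::printf("\n");
    for(float v : softmax_1d(x, 1.f, true)) std::printf("%f ", v);  // log-softmax
    std::printf("\n");
    return 0;
}
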
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 70203d9ce9..2a9e30604e 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -73,6 +73,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
QuantizationInfo(1.f/256, 12)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high
QuantizationInfo(1.f/256, 12)),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low
@@ -85,6 +86,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
QuantizationInfo(1.f/256, 0)),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
QuantizationInfo(1.f/256, 0)),
TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
@@ -95,18 +97,20 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
1.0,
2.0,
1.0,
+ 1.0,
2.0,
1.0,
})),
framework::dataset::make("axis", { 0,
0,
0,
+ 1,
0,
- 0,
+ -1,
2,
-3,
})),
- framework::dataset::make("Expected", { false, false, false, true, true, false, false })),
+ framework::dataset::make("Expected", { false, false, false, true, true, true, false, false })),
input_info, output_info, beta, axis, expected)
{
ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
@@ -123,7 +127,7 @@ TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
@@ -131,7 +135,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::Dataset
FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
framework::dataset::make("DataType", DataType::F16)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 2, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f16);
@@ -151,7 +155,7 @@ TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
@@ -159,7 +163,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::Data
FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
framework::dataset::make("DataType", DataType::F32)),
framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -2, 3 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
@@ -184,7 +188,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<uint8_t>, fram
framework::dataset::make("DataType", DataType::QASYMM8)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -193,7 +197,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<uint8_t>, fram
framework::dataset::make("DataType", DataType::QASYMM8)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1, -2 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -214,7 +218,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerQuantizedFixture<int8_t>, frame
framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
@@ -223,7 +227,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerQuantizedFixture<int8_t>, frame
framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
framework::dataset::make("Beta", { 1.0f, 2.f }))),
- framework::dataset::make("Axis", { 0 })))
+ framework::dataset::make("Axis", { 0, 1, -1 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
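
The two extra rows in the Validate dataset above pair rank-2 inputs (TensorShape(32U, 13U)) with axes 2 and -3, which fall outside [-rank, rank) and are expected to fail. A hedged sketch of that kind of range check (plain C++, not NESoftmaxLayer::validate; is_valid_softmax_axis is illustrative only):

#include <cstdio>

// Illustrative only: the library reports this through its Status/validate machinery.
static bool is_valid_softmax_axis(int axis, int rank)
{
    return axis >= -rank && axis < rank;
}

int main()
{
    const int rank = 2; // e.g. TensorShape(32U, 13U)
    for(int axis : { 0, 1, -1, 2, -3 })
    {
        std::printf("axis %2d on a rank-%d tensor: %s\n", axis, rank,
                    is_valid_softmax_axis(axis, rank) ? "valid" : "invalid");
    }
    return 0;
}
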
diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h
index 29a3ed2cd0..30356d648d 100644
--- a/tests/validation/fixtures/SoftmaxLayerFixture.h
+++ b/tests/validation/fixtures/SoftmaxLayerFixture.h
@@ -32,7 +32,6 @@
#include "tests/IAccessor.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Fixture.h"
-#include "tests/validation/reference/LogSoftmaxLayer.h"
#include "tests/validation/reference/SoftmaxLayer.h"
#include <random>
@@ -52,8 +51,8 @@ public:
{
_quantization_info = quantization_info;
- _target = compute_target(shape, data_type, quantization_info, beta, axis);
_reference = compute_reference(shape, data_type, quantization_info, beta, axis);
+ _target = compute_target(shape, data_type, quantization_info, beta, axis);
}
protected:
@@ -62,7 +61,7 @@ protected:
{
if(!is_data_type_quantized(tensor.data_type()))
{
- std::uniform_real_distribution<> distribution(-1000.f, 1000.f);
+ std::uniform_real_distribution<> distribution(-10.f, 10.f);
library->fill(tensor, distribution, 0);
}
else // data type is quantized_asymmetric (signed or unsigned)
@@ -111,14 +110,7 @@ protected:
// Fill reference
fill(src);
- if(IS_LOG)
- {
- return reference::log_softmax_layer<T>(src, beta, axis);
- }
- else
- {
- return reference::softmax_layer<T>(src, beta, axis);
- }
+ return reference::softmax_layer<T>(src, beta, axis, IS_LOG);
}
TensorType _target{};
@@ -155,6 +147,7 @@ public:
axis);
}
};
+
} // namespace validation
} // namespace test
} // namespace arm_compute
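
With values spread over +/-1000, a float softmax saturates to a one-hot vector; the narrower +/-10 fill range used in the fixture above avoids that regime. A standalone illustration of the effect (plain C++, independent of the fixture):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> softmax(const std::vector<float> &x)
{
    const float max_val = *std::max_element(x.begin(), x.end());
    float sum = 0.f;
    std::vector<float> y;
    for(float v : x)
    {
        y.push_back(std::exp(v - max_val));
        sum += y.back();
    }
    for(float &v : y)
    {
        v /= sum;
    }
    return y;
}

int main()
{
    // Spread comparable to the old fill range: collapses to 0 0 1 in float.
    for(float v : softmax({ -1000.f, 0.f, 1000.f })) std::printf("%g ", v);
    std::printf("\n");
    // Spread comparable to the new fill range: small but non-zero probabilities remain.
    for(float v : softmax({ -10.f, 0.f, 10.f })) std::printf("%g ", v);
    std::printf("\n");
    return 0;
}
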
diff --git a/tests/validation/reference/LogSoftmaxLayer.cpp b/tests/validation/reference/LogSoftmaxLayer.cpp
deleted file mode 100644
index 8d3b8f7579..0000000000
--- a/tests/validation/reference/LogSoftmaxLayer.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "LogSoftmaxLayer.h"
-#include "SoftmaxLayer.h"
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
-{
- return softmax_layer_generic<T>(src, beta, reduce_end_axis, true);
-}
-
-template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type >
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
-{
- const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), true);
-
- SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
- SimpleTensor<float> dst_tmp = log_softmax_layer<float>(src_tmp, beta, reduce_end_axis);
- SimpleTensor<T> dst = convert_to_asymmetric<T>(dst_tmp, output_quantization_info);
- return dst;
-}
-
-template SimpleTensor<float> log_softmax_layer(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<half> log_softmax_layer(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<uint8_t> log_softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<int8_t> log_softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t reduce_end_axis);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
diff --git a/tests/validation/reference/LogSoftmaxLayer.h b/tests/validation/reference/LogSoftmaxLayer.h
deleted file mode 100644
index db945074a2..0000000000
--- a/tests/validation/reference/LogSoftmaxLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H
-#define ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H
-
-#include "tests/SimpleTensor.h"
-#include "tests/validation/Helpers.h"
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-namespace reference
-{
-template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
-
-template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type = 0 >
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
-} // namespace reference
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_SOFTMAX_LAYER_H */
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index 00206766f8..3fbac32a9b 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
+#include "utils/TypePrinter.h"
namespace arm_compute
{
@@ -35,39 +36,43 @@ namespace validation
namespace reference
{
template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis, bool is_log)
+SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
{
// Create reference
SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
- // Convert reduce-before axis (inclusive) to first n axes to reduce
- const size_t first_n_reduce_axes = dim_index_2_num_dims(reduce_end_axis, static_cast<int32_t>(src.shape().num_dimensions()));
+ const int32_t n_dims = static_cast<int32_t>(src.shape().num_dimensions());
+ ARM_COMPUTE_ERROR_ON(axis < -n_dims || axis >= n_dims);
- // Compute reference. Lower dims are the collapsing of the first axis
- // dimensions (i.e., the flattened dimension of each batch). The upper dims are
- // instead the batches we want to normalize
+ const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, n_dims));
+ Window window;
+ window.use_tensor_dimensions(src.shape());
+ const unsigned int axis_dimension = src.shape()[actual_axis];
+ window.set(actual_axis, Window::Dimension(0, 1, 1));
- const int lower_dims = src.shape().total_size_lower(first_n_reduce_axes);
-
- const int upper_dims = src.shape().total_size_upper(first_n_reduce_axes);
-
-#if defined(_OPENMP)
- #pragma omp parallel for
-#endif /* _OPENMP */
- for(int r = 0; r < upper_dims; ++r)
+ execute_window_loop(window, [&](const Coordinates & id)
{
- const T *src_row_ptr = src.data() + r * lower_dims;
- T *dst_row_ptr = dst.data() + r * lower_dims;
-
- // Find max
- const T max = *std::max_element(src_row_ptr, src_row_ptr + lower_dims);
+ // Find max along axis
+ Coordinates offset(id);
+ offset.set(actual_axis, 0);
+ T max = *reinterpret_cast<const T *>(src(offset));
+ for(unsigned int axis_id = 1; axis_id < axis_dimension; ++axis_id)
+ {
+ offset.set(actual_axis, axis_id);
+ const T val = *reinterpret_cast<const T *>(src(offset));
+ if(val > max)
+ {
+ max = val;
+ }
+ }
// Regularize
T sum(0.f);
- std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta, is_log](T val)
+ for(unsigned int axis_id = 0; axis_id < axis_dimension; ++axis_id)
{
- T res{ (val - max) *beta };
-
+ offset.set(actual_axis, axis_id);
+ const T val = *reinterpret_cast<const T *>(src(offset));
+ T res{ (val - max) *beta };
if(is_log)
{
sum += std::exp(res);
@@ -77,50 +82,52 @@ SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, in
res = std::exp(res);
sum += res;
}
- return res;
- });
+ *reinterpret_cast<T *>(dst(offset)) = res;
+ }
// Normalize
- std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum, is_log](T val)
+ for(unsigned int axis_id = 0; axis_id < axis_dimension; ++axis_id)
{
+ offset.set(actual_axis, axis_id);
+ const T val = *reinterpret_cast<const T *>(dst(offset));
if(is_log)
{
- return val - static_cast<T>(std::log(sum));
+ *reinterpret_cast<T *>(dst(offset)) = val - static_cast<T>(std::log(sum));
}
else
{
- return val / sum;
+ *reinterpret_cast<T *>(dst(offset)) = val / sum;
}
- });
- }
-
+ }
+ });
return dst;
}
-template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis, bool is_log);
-template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis, bool is_log);
+template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, int32_t axis, bool is_log);
template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
{
- return softmax_layer_generic<T>(src, beta, reduce_end_axis, false);
+ return softmax_layer_generic<T>(src, beta, axis, is_log);
}
template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type >
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
{
- const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), false);
+ const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log);
SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
- SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, reduce_end_axis);
+ SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, axis, is_log);
SimpleTensor<T> dst = convert_to_asymmetric<T>(dst_tmp, output_quantization_info);
return dst;
}
-template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t reduce_end_axis);
-template SimpleTensor<int8_t> softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<int8_t> softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t axis, bool is_log);
+
} // namespace reference
} // namespace validation
} // namespace test
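
The rewritten reference above fixes every coordinate except the softmax axis via the Window, then walks that axis three times (max, exp/sum, normalise). A condensed standalone analogue over a flat row-major buffer (plain C++, not SimpleTensor-based; strides_of and softmax_axis are illustrative helpers):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Row-major strides for a given shape; helper for illustration only.
static std::vector<size_t> strides_of(const std::vector<size_t> &shape)
{
    std::vector<size_t> s(shape.size(), 1);
    for(int i = static_cast<int>(shape.size()) - 2; i >= 0; --i)
    {
        s[i] = s[i + 1] * shape[i + 1];
    }
    return s;
}

static void softmax_axis(std::vector<float> &data, const std::vector<size_t> &shape, size_t axis)
{
    const std::vector<size_t> strides = strides_of(shape);
    const size_t              len     = shape[axis];
    const size_t              stride  = strides[axis];
    for(size_t start = 0; start < data.size(); ++start)
    {
        // Only treat indices whose coordinate along `axis` is zero as slice starts.
        if((start / stride) % len != 0) continue;
        // Pass 1: max along the axis.
        float max_val = data[start];
        for(size_t i = 1; i < len; ++i) max_val = std::max(max_val, data[start + i * stride]);
        // Pass 2: exponentiate and accumulate the sum.
        float sum = 0.f;
        for(size_t i = 0; i < len; ++i)
        {
            data[start + i * stride] = std::exp(data[start + i * stride] - max_val);
            sum += data[start + i * stride];
        }
        // Pass 3: normalise.
        for(size_t i = 0; i < len; ++i) data[start + i * stride] /= sum;
    }
}

int main()
{
    std::vector<float> t = { 1, 2, 3, 4, 5, 6 }; // shape {2, 3}, row-major
    softmax_axis(t, { 2, 3 }, 1);                // softmax over each row of length 3
    for(float v : t) std::printf("%f ", v);
    std::printf("\n");
    return 0;
}
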
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index 2af0b6d36a..3362f195c9 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -36,13 +36,13 @@ namespace validation
namespace reference
{
template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis, bool is_log = false);
+SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log = false);
template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = 0, bool is_log = false);
template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type = 0 >
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = 0, bool is_log = false);
} // namespace reference
} // namespace validation
} // namespace test