5 files changed, 174 insertions, 29 deletions
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 34349ed52b..90c99d6569 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -24,6 +24,8 @@
 #ifndef __ARM_COMPUTE_CLSOFTMAXLAYER_H__
 #define __ARM_COMPUTE_CLSOFTMAXLAYER_H__
 
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
@@ -71,12 +73,29 @@ public:
     void run() override;
 
 private:
+    /** Utility method to configure the kernels needed to flatten the input
+     * tensor.
+     *
+     * @note This function changes the internal state of this class. In particular,
+     * it initializes the kernel @p _flatten_kernel and the tensor @p _input_flat, and
+     * it auto-initializes the info of @p output if that info is still empty
+     *
+     * @param[in] input  Original source tensor.
+     * @param[in] output Original destination tensor.
+     */
+    void configure_flatten_kernel(const ICLTensor *input, const ICLTensor *output);
+
     CLMemoryGroup                  _memory_group;
     CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
     CLLogits1DNormKernel           _norm_kernel;
+    CLFlattenLayerKernel           _flatten_kernel;
+    CLReshapeLayerKernel           _reshape_kernel;
     CLTensor                       _max;
     CLTensor                       _sum;
     CLTensor                       _tmp;
+    CLTensor                       _input_flat;
+    CLTensor                       _output_flat;
+    bool                           _needs_flattening;
 };
 }
 #endif /* __ARM_COMPUTE_CLSOFTMAXLAYER_H__ */
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7a20d9f94b..3a7d6c770b 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -29,14 +29,32 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flat(), _output_flat(),
+      _needs_flattening(false)
+{
+}
+
+void CLSoftmaxLayer::configure_flatten_kernel(const ICLTensor *input, const ICLTensor *output)
 {
+    // Flatten the input
+    const TensorShape shape_flatten = misc::shape_calculator::compute_flatten_shape(input->info());
+
+    // Initialize the flat input
+    _input_flat.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+    // Configure the flatten_kernel
+    _flatten_kernel.configure(input, &_input_flat);
+
+    // We need to init the output tensor here. Indeed, the reshape kernel expects
+    // both tensors to be already initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
 void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
@@ -45,13 +63,32 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info()));
 
+    _needs_flattening = input->info()->num_dimensions() > 2;
+
+    // If the input has more than 2 dimensions (3D or 4D tensor), we will:
+    // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
+    // - Execute all the pipeline (reduction + normalization) on the flattened tensor
+    // - Reshape the flattened output into the real output
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _input_flat
+        _memory_group.manage(&_input_flat);
+
+        // Configure _flatten_kernel and _input_flat
+        configure_flatten_kernel(input, output);
+    }
+
+    // We want to deal with a 2D input. Either it is the flattened version of the original input (more than 2 dimensions)
+    // or it is the original input itself (2D case)
+    const ICLTensor *input_2D = (_needs_flattening ? &_input_flat : input);
+
     // Create intermediate tensors shapes
-    const TensorInfo input_info    = input->info()->clone()->reset_padding().set_is_resizable(true);
-    DataType         tmp_data_type = is_data_type_quantized_asymmetric(input->info()->data_type()) ? DataType::S32 : input->info()->data_type();
-    TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+    TensorInfo input_info    = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+    DataType   tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::S32 : input_2D->info()->data_type();
+    TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
     _tmp.allocator()->init(tensor_info_tmp);
 
-    TensorShape max_sum_shape = input->info()->tensor_shape();
+    TensorShape max_sum_shape = input_2D->info()->tensor_shape();
     max_sum_shape.set(0, 1);
     _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
     _sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type));
@@ -65,8 +102,28 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
     _memory_group.manage(&_sum);
 
     // Configure kernels
-    _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
-    _norm_kernel.configure(&_tmp, &_sum, output, beta);
+    _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, beta);
+
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _output_flat
+        _memory_group.manage(&_output_flat);
+
+        // The normalization kernel stores the result in a flat output tensor
+        _norm_kernel.configure(&_tmp, &_sum, &_output_flat, beta);
+
+        // Reshape the flat output into the requested (4D) output
+        _reshape_kernel.configure(&_output_flat, output);
+
+        // Allocate the intermediate flat tensors
+        _input_flat.allocator()->allocate();
+        _output_flat.allocator()->allocate();
+    }
+    else
+    {
+        // Softmax 2D case
+        _norm_kernel.configure(&_tmp, &_sum, output, beta);
+    }
 
     // Allocate intermediate buffers
     _tmp.allocator()->allocate();
@@ -77,7 +134,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
 Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
 
     // Create intermediate tensor info
     DataType   tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
@@ -88,6 +145,14 @@ Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out
     TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
     TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
 
+    const TensorShape shape_flatten = misc::shape_calculator::compute_flatten_shape(input);
+    TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+    if(input->num_dimensions() > 2) // needs flattening
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat));
+    }
+
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
 
@@ -97,9 +162,21 @@ Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out
 void CLSoftmaxLayer::run()
 {
     _memory_group.acquire();
+    if(_needs_flattening)
+    {
+        CLScheduler::get().enqueue(_flatten_kernel, false);
+    }
 
     CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
-    CLScheduler::get().enqueue(_norm_kernel);
+    CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening);
 
+    if(_needs_flattening)
+    {
+        CLScheduler::get().enqueue(_reshape_kernel, true);
+    }
+
+    // Release intermediate buffers
     _memory_group.release();
 }
+
+} // namespace arm_compute
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 4d75a16e47..c7955bc8c5 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -794,6 +794,24 @@ public:
         TensorShape{ 1000U, 10U },
                      TensorShape{ 3989U, 10U },
                      TensorShape{ 7339U, 11U },
+
+    })
+    {
+    }
+};
+
+/** Data set containing large and small softmax layer shapes with more than two dimensions (3D and 4D). */
+class SoftmaxLayer4DShapes final : public ShapeDataset
+{
+public:
+    SoftmaxLayer4DShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 9U, 9U, 9U, 9U },
+                     TensorShape{ 256U, 10U, 1U, 9U },
+                     TensorShape{ 353U, 8U, 2U },
+                     TensorShape{ 781U, 5U, 2U, 2U },
+                     TensorShape{ 781U, 11U, 1U, 2U },
     })
     {
     }
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 66ca0b8ca7..7dab626b58 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -82,16 +82,20 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datase
     validate(src.info()->valid_region(), valid_region);
     validate(dst.info()->valid_region(), valid_region);
 
-    // Get reduction kernel info
-    CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(shape.x());
-
-    // Validate src padding
-    const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding();
-    validate(src.info()->padding(), padding_src);
-
-    // Validate dst padding
-    const PaddingSize padding_dst = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(dst.info()->padding(), padding_dst);
+    // CLLogits1DMaxShiftExpSumKernel configures the paddings only in the 2D case
+    if(shape.num_dimensions() <= 2)
+    {
+        // Get reduction kernel info
+        CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(shape.x());
+
+        // Validate src padding for 2D softmax
+        const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding();
+        validate(src.info()->padding(), padding_src);
+
+        // Validate dst padding for 2D softmax
+        const PaddingSize padding_dst = PaddingCalculator(shape.x(), 16).required_padding();
+        validate(dst.info()->padding(), padding_dst);
+    }
 }
 
 // *INDENT-OFF*
@@ -144,6 +148,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half>, framework::Dataset
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
+                                                                                                            framework::dataset::make("DataType", DataType::F16)),
+                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
@@ -161,6 +172,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<float>, framework::Datase
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
+                                                                                                             framework::dataset::make("DataType", DataType::F32)),
+                                                                                                     framework::dataset::make("Beta", { 1.0f, 2.0f })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
 TEST_SUITE_END()
 TEST_SUITE_END()
 
@@ -185,6 +203,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerQuantizedFixture<uint8_t>, framew
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
+                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+                                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index aa640ad5e6..7f2c36ecef 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -39,21 +39,25 @@ SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
     // Create reference
     SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
 
-    // Compute reference
-    const int cols       = src.shape()[0];
-    const int upper_dims = src.num_elements() / cols;
+    const bool is_4D_input = (src.shape().num_dimensions() > 2);
+
+    // Compute reference. Lower dims are
+    // - the number of columns for the 2D case
+    // - the collapsing of the first three dimensions (i.e., the flattened dimension of each batch) in the 4D case
+    const int lower_dims = (is_4D_input ? src.shape()[2] * src.shape()[1] * src.shape()[0] : src.shape()[0]);
+    const int upper_dims = src.num_elements() / lower_dims;
 
     for(int r = 0; r < upper_dims; ++r)
     {
-        const T *src_row_ptr = src.data() + r * cols;
-        T       *dst_row_ptr = dst.data() + r * cols;
+        const T *src_row_ptr = src.data() + r * lower_dims;
+        T       *dst_row_ptr = dst.data() + r * lower_dims;
 
         // Find max
-        const T max = *std::max_element(src_row_ptr, src_row_ptr + cols);
+        const T max = *std::max_element(src_row_ptr, src_row_ptr + lower_dims);
 
         // Regularize
         T sum(0.f);
-        std::transform(src_row_ptr, src_row_ptr + cols, dst_row_ptr, [&sum, max, beta](T val)
+        std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta](T val)
         {
             const T res(std::exp((val - max) * beta));
             sum += res;
@@ -61,7 +65,7 @@ SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
         });
 
         // Normalize
-        std::transform(dst_row_ptr, dst_row_ptr + cols, dst_row_ptr, [sum](T val)
+        std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum](T val)
         {
             return val / sum;
         });