aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arm_compute/runtime/CL/functions/CLSoftmaxLayer.h19
-rw-r--r--src/runtime/CL/functions/CLSoftmaxLayer.cpp99
-rw-r--r--tests/datasets/ShapeDatasets.h18
-rw-r--r--tests/validation/CL/SoftmaxLayer.cpp47
-rw-r--r--tests/validation/reference/SoftmaxLayer.cpp20
5 files changed, 174 insertions, 29 deletions
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 34349ed52b..90c99d6569 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -24,6 +24,8 @@
#ifndef __ARM_COMPUTE_CLSOFTMAXLAYER_H__
#define __ARM_COMPUTE_CLSOFTMAXLAYER_H__
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
@@ -71,12 +73,29 @@ public:
void run() override;
private:
+ /** Utility method to configure the kernels needed to flatten the input
+ * tensor.
+ *
+ * @note This function changes the internal state of this class. In particular,
+ * it initializes the kernel @p _flatten_kernel and the tensor @p _input_flat. It also
+ * calls auto_init_if_empty() on the info of @p output; @p _output_flat is not touched here
+ *
+ * @param[in] input Original source tensor.
+ * @param[in] output Original destination tensor.
+ */
+ void configure_flatten_kernel(const ICLTensor *input, const ICLTensor *output);
+
CLMemoryGroup _memory_group;
CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
CLLogits1DNormKernel _norm_kernel;
+ CLFlattenLayerKernel _flatten_kernel;
+ CLReshapeLayerKernel _reshape_kernel;
CLTensor _max;
CLTensor _sum;
CLTensor _tmp;
+ CLTensor _input_flat;
+ CLTensor _output_flat;
+ bool _needs_flattening;
};
}
#endif /* __ARM_COMPUTE_CLSOFTMAXLAYER_H__ */
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7a20d9f94b..3a7d6c770b 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -29,14 +29,32 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+ : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flat(), _output_flat(),
+ _needs_flattening(false)
+{
+}
+
+void CLSoftmaxLayer::configure_flatten_kernel(const ICLTensor *input, const ICLTensor *output)
{
+ // Flatten the input
+ const TensorShape shape_flatten = misc::shape_calculator::compute_flatten_shape(input->info());
+
+ // Initialize the flat input
+ _input_flat.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+ // Configure the flatten_kernel
+ _flatten_kernel.configure(input, &_input_flat);
+
+ // We need to init the output tensor here. Indeed, the reshape kernel expects
+ // both tensors to be already initialized
+ auto_init_if_empty(*output->info(), *input->info()->clone());
}
void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
@@ -45,13 +63,32 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info()));
+ _needs_flattening = input->info()->num_dimensions() > 2;
+
+ // If we are dealing with a tensor that has more than 2 dimensions (e.g. 3D or 4D), we will:
+ // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
+ // - Execute all the pipeline (reduction + normalization) on the flattened tensor
+ // - Reshape the flattened output into the real output
+ if(_needs_flattening)
+ {
+ // Add to the memory manager _input_flat
+ _memory_group.manage(&_input_flat);
+
+ // Configure _flatten_kernel and _input_flat
+ configure_flatten_kernel(input, output);
+ }
+
+ // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+ // or it is the original input case (2D case)
+ const ICLTensor *input_2D = (_needs_flattening ? &_input_flat : input);
+
// Create intermediate tensors shapes
- const TensorInfo input_info = input->info()->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(input->info()->data_type()) ? DataType::S32 : input->info()->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+ TensorInfo input_info = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::S32 : input_2D->info()->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
_tmp.allocator()->init(tensor_info_tmp);
- TensorShape max_sum_shape = input->info()->tensor_shape();
+ TensorShape max_sum_shape = input_2D->info()->tensor_shape();
max_sum_shape.set(0, 1);
_max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
_sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type));
@@ -65,8 +102,28 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
_memory_group.manage(&_sum);
// Configure kernels
- _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
- _norm_kernel.configure(&_tmp, &_sum, output, beta);
+ _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, beta);
+
+ if(_needs_flattening)
+ {
+ // Add to the memory manager _output_flat
+ _memory_group.manage(&_output_flat);
+
+ // The normalization kernel stores the result in a flat output tensor
+ _norm_kernel.configure(&_tmp, &_sum, &_output_flat, beta);
+
+ // Reshape the flat output into the requested (4D) output
+ _reshape_kernel.configure(&_output_flat, output);
+
+ // Allocate the intermediate flat tensors
+ _input_flat.allocator()->allocate();
+ _output_flat.allocator()->allocate();
+ }
+ else
+ {
+ // Softmax 2D case
+ _norm_kernel.configure(&_tmp, &_sum, output, beta);
+ }
// Allocate intermediate buffers
_tmp.allocator()->allocate();
@@ -77,7 +134,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
// Create intermediate tensor info
DataType tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
@@ -88,6 +145,14 @@ Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out
TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
+ const TensorShape shape_flatten = misc::shape_calculator::compute_flatten_shape(input);
+ TensorInfo tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+ if(input->num_dimensions() > 2) // needs flattening
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat));
+ }
+
ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
@@ -97,9 +162,21 @@ Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out
void CLSoftmaxLayer::run()
{
_memory_group.acquire();
+ if(_needs_flattening)
+ {
+ CLScheduler::get().enqueue(_flatten_kernel, false);
+ }
CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
- CLScheduler::get().enqueue(_norm_kernel);
+ CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening);
+ if(_needs_flattening)
+ {
+ CLScheduler::get().enqueue(_reshape_kernel, true);
+ }
+
+ // Release intermediate buffers
_memory_group.release();
}
+
+} // namespace arm_compute
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 4d75a16e47..c7955bc8c5 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -794,6 +794,24 @@ public:
TensorShape{ 1000U, 10U },
TensorShape{ 3989U, 10U },
TensorShape{ 7339U, 11U },
+
+ })
+ {
+ }
+};
+
+/** Data set containing large and small softmax layer 4D shapes. */
+class SoftmaxLayer4DShapes final : public ShapeDataset
+{
+public:
+ SoftmaxLayer4DShapes()
+ : ShapeDataset("Shape",
+ {
+ TensorShape{ 9U, 9U, 9U, 9U },
+ TensorShape{ 256U, 10U, 1U, 9U },
+ TensorShape{ 353U, 8U, 2U },
+ TensorShape{ 781U, 5U, 2U, 2U },
+ TensorShape{ 781U, 11U, 1U, 2U },
})
{
}
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 66ca0b8ca7..7dab626b58 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -82,16 +82,20 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datase
validate(src.info()->valid_region(), valid_region);
validate(dst.info()->valid_region(), valid_region);
- // Get reduction kernel info
- CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(shape.x());
-
- // Validate src padding
- const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding();
- validate(src.info()->padding(), padding_src);
-
- // Validate dst padding
- const PaddingSize padding_dst = PaddingCalculator(shape.x(), 16).required_padding();
- validate(dst.info()->padding(), padding_dst);
+ // CLLogits1DMaxShiftExpSumKernel configures the paddings only in the 2D case
+ if(shape.num_dimensions() <= 2)
+ {
+ // Get reduction kernel info
+ CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(shape.x());
+
+ // Validate src padding for 2D softmax
+ const PaddingSize padding_src = PaddingCalculator(shape.x(), std::get<1>(reduction_info)).required_padding();
+ validate(src.info()->padding(), padding_src);
+
+ // Validate dst padding for 2D softmax
+ const PaddingSize padding_dst = PaddingCalculator(shape.x(), 16).required_padding();
+ validate(dst.info()->padding(), padding_dst);
+ }
}
// *INDENT-OFF*
@@ -144,6 +148,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half>, framework::Dataset
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
}
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
+ framework::dataset::make("DataType", DataType::F16)),
+ framework::dataset::make("Beta", { 1.0f, 2.0f })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f16);
+}
TEST_SUITE_END()
TEST_SUITE(FP32)
@@ -161,6 +172,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<float>, framework::Datase
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("Beta", { 1.0f, 2.0f })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
TEST_SUITE_END()
TEST_SUITE_END()
@@ -185,6 +203,15 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerQuantizedFixture<uint8_t>, framew
// Validate output
validate(CLAccessor(_target), _reference, tolerance_qasymm8);
}
+FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::SoftmaxLayer4DShapes(),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
+ framework::dataset::make("Beta", { 1.0f, 2.0f }))))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
TEST_SUITE_END()
TEST_SUITE_END()
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index aa640ad5e6..7f2c36ecef 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -39,21 +39,25 @@ SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
// Create reference
SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
- // Compute reference
- const int cols = src.shape()[0];
- const int upper_dims = src.num_elements() / cols;
+ const bool is_4D_input = (src.shape().num_dimensions() > 2);
+
+ // Compute reference. Lower dims are
+ // - the number of columns for the 2D case
+ // - the collapsing of the first three dimensions (i.e., the flattened dimension of each batch) in the 4D case
+ const int lower_dims = (is_4D_input ? src.shape()[2] * src.shape()[1] * src.shape()[0] : src.shape()[0]);
+ const int upper_dims = src.num_elements() / lower_dims;
for(int r = 0; r < upper_dims; ++r)
{
- const T *src_row_ptr = src.data() + r * cols;
- T *dst_row_ptr = dst.data() + r * cols;
+ const T *src_row_ptr = src.data() + r * lower_dims;
+ T *dst_row_ptr = dst.data() + r * lower_dims;
// Find max
- const T max = *std::max_element(src_row_ptr, src_row_ptr + cols);
+ const T max = *std::max_element(src_row_ptr, src_row_ptr + lower_dims);
// Regularize
T sum(0.f);
- std::transform(src_row_ptr, src_row_ptr + cols, dst_row_ptr, [&sum, max, beta](T val)
+ std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta](T val)
{
const T res(std::exp((val - max) * beta));
sum += res;
@@ -61,7 +65,7 @@ SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta)
});
// Normalize
- std::transform(dst_row_ptr, dst_row_ptr + cols, dst_row_ptr, [sum](T val)
+ std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum](T val)
{
return val / sum;
});