author     Omar Al Khatib <omar.alkhatib@arm.com>    2022-12-15 09:12:12 +0000
committer  Omar Al Khatib <omar.alkhatib@arm.com>    2022-12-21 15:03:22 +0000
commit     e317baf74dd6206932877e254350b9ab913426f1 (patch)
tree       a46999e267957cc9623f0ab644297259b0facf60
parent     85260d8c21e7209d4777150f436b336f85812dce (diff)
download   ComputeLibrary-e317baf74dd6206932877e254350b9ab913426f1.tar.gz
Optimize MeanReduce by using integer accumulation and removing the upfront dequantization.
Resolves: [COMPMID-5466]

Signed-off-by: Omar Al Khatib <omar.alkhatib@arm.com>
Change-Id: I68af0bb54580bebd2ace1fba30aa73f7f68a4dbb
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8804
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/runtime/NEON/functions/NEReduceMean.h      9
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.cpp  92
-rw-r--r--  src/runtime/NEON/functions/NEReduceMean.cpp           45
3 files changed, 78 insertions(+), 68 deletions(-)
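The core of the change is the requantized-mean identity introduced in NEReductionOperationKernel.cpp below. With input quantization (in_scale, in_offset) and output quantization (out_scale, out_offset), a quantized value q represents the real value in_scale * (q - in_offset), so the mean of N elements, requantized into the output space, is q_out = A * sum(q_in) + B with A = in_scale / (out_scale * N) and B = out_offset - (in_scale * in_offset) / out_scale. This lets the kernel accumulate raw integer values and apply a single affine transform at the end instead of dequantizing every element to float up front. A minimal scalar sketch of the idea (illustrative only; the actual kernel vectorizes the transform and narrows with utils::cast::saturate_cast):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Requantized mean of QASYMM8 values: accumulate in int32, then apply the
    // fused dequantize+mean+requantize affine transform in one step.
    uint8_t quantized_mean(const std::vector<uint8_t> &in,
                           float in_scale, int32_t in_offset,
                           float out_scale, int32_t out_offset)
    {
        int32_t sum = 0;
        for(uint8_t q : in)
        {
            sum += q; // integer accumulation, no per-element dequantization
        }
        const float   N   = static_cast<float>(in.size());
        const float   A   = in_scale / (out_scale * N);
        const float   B   = out_offset - (in_scale * in_offset) / out_scale;
        const int32_t res = static_cast<int32_t>(A * static_cast<float>(sum) + B);
        return static_cast<uint8_t>(std::clamp(res, 0, 255)); // saturate to the QASYMM8 range
    }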
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index 7512115a3f..caaee8284a 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,8 +28,6 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/Tensor.h"
@@ -93,13 +91,8 @@ private:
std::vector<NEReductionOperation> _reduction_kernels;
std::vector<Tensor> _reduced_outs;
NEReshapeLayer _reshape;
- NEDequantizationLayer _dequant;
- NEQuantizationLayer _requant;
int _reduction_ops;
bool _keep_dims;
- bool _do_requant;
- Tensor _input_no_quant;
- Tensor _output_no_quant;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index bfecccf94b..e0f43ab176 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -401,7 +401,8 @@ struct RedOpX
Iterator input(in, in_win_no_pad);
Iterator output(out, out_window);
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ execute_window_loop(
+ in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
@@ -609,6 +610,8 @@ struct RedOpX_quantized
{
using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+ const auto oq_info = out->info()->quantization_info().uniform();
+
const TensorInfo in_info = *(in->info());
const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
@@ -622,7 +625,19 @@ struct RedOpX_quantized
Iterator input(in, in_win_no_pad);
Iterator output(out, out_window);
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ const float in_offset = static_cast<float>(iq_info.offset);
+ const float in_scale = iq_info.scale;
+
+ const float out_offset = static_cast<float>(oq_info.offset);
+ const float out_scale = oq_info.scale;
+
+ const float num_elements = static_cast<float>(in_info.dimension(0));
+
+ const float A = in_scale / (out_scale * num_elements);
+ const float B = out_offset - (in_scale * in_offset) / (out_scale);
+
+ execute_window_loop(
+ in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -844,14 +859,17 @@ struct RedOpX_quantized
if(op == ReductionOperation::MEAN_SUM)
{
- res /= static_cast<int32_t>(in_info.dimension(0));
+ const int32_t resFinal = A * (static_cast<float>(res)) + B;
+
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
}
else
{
// Subtract accumulated offsets
res -= (in_info.dimension(0) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
}
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+
break;
}
default:
@@ -887,7 +905,8 @@ struct RedOpYZW
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ execute_window_loop(
+ in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -1110,7 +1129,8 @@ struct RedOpYZW_complex
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ execute_window_loop(
+ in_win_no_pad, [&](const Coordinates &)
{
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -1169,6 +1189,8 @@ struct RedOpYZW_quantized
const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+ const auto oq_info = out->info()->quantization_info().uniform();
+
const int window_step_x = 16 / sizeof(T);
const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
const auto window_end_x_tmp = static_cast<int>(in_window.x().end());
@@ -1197,7 +1219,22 @@ struct RedOpYZW_quantized
vector_type_f vec_res_value3_f{};
vector_type_f vec_res_value4_f{};
- execute_window_loop(in_win_no_pad, [&](const Coordinates &)
+ const float in_offset = static_cast<float>(iq_info.offset);
+ const float in_scale = iq_info.scale;
+
+ const float out_offset = static_cast<float>(oq_info.offset);
+ const float out_scale = oq_info.scale;
+
+ const float num_elements = static_cast<float>(in_info.dimension(axis));
+
+ const float A = in_scale / (out_scale * num_elements);
+ const float B = out_offset - (in_scale * in_offset) / (out_scale);
+
+ const auto vec_A = wrapper::vdup_n(static_cast<float>(A), wrapper::traits::vector_128_tag{});
+ const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{});
+
+ execute_window_loop(
+ in_win_no_pad, [&](const Coordinates &)
{
const auto input_ptr = reinterpret_cast<T *>(input.ptr());
@@ -1340,11 +1377,10 @@ struct RedOpYZW_quantized
}
case ReductionOperation::MEAN_SUM:
{
- const auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<float>(in_info.dimension(axis)), wrapper::traits::vector_128_tag{}));
- vec_res_value1_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value1), vec_width_inv);
- vec_res_value2_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value2), vec_width_inv);
- vec_res_value3_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value3), vec_width_inv);
- vec_res_value4_f = wrapper::vmul(wrapper::vcvt<float>(vec_res_value4), vec_width_inv);
+ vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+ vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+ vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+ vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
@@ -1389,7 +1425,9 @@ struct RedOpYZW_quantized
// Compute left-over elements
for(; x < window_end_x; ++x)
{
- float res_value = 0.f;
+ float res_value = 0.f;
+ int32_t res_value_q = 0;
+
switch(op)
{
case ReductionOperation::ARG_IDX_MAX:
@@ -1419,11 +1457,15 @@ struct RedOpYZW_quantized
switch(op)
{
case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
{
res_value += *in_ptr;
break;
}
+ case ReductionOperation::MEAN_SUM:
+ {
+ res_value_q += *in_ptr;
+ break;
+ }
case ReductionOperation::SUM_SQUARE:
{
res_value += *in_ptr * *in_ptr;
@@ -1479,8 +1521,7 @@ struct RedOpYZW_quantized
{
case ReductionOperation::MEAN_SUM:
{
- int32_t res = static_cast<int32_t>(res_value);
- res /= static_cast<int32_t>(in_info.dimension(axis));
+ const int32_t res = A * (static_cast<float>(res_value_q)) + B;
*reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
break;
}
@@ -1552,30 +1593,46 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
switch(axis)
{
case 0:
+ {
switch(input->info()->data_type())
{
case DataType::QASYMM8:
+ {
return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+ }
case DataType::QASYMM8_SIGNED:
+ {
return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+ }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
+ {
return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+ }
case DataType::S32:
+ {
return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
+ }
default:
+ {
ARM_COMPUTE_ERROR("Not supported");
+ }
}
+ }
case 1:
switch(input->info()->data_type())
{
case DataType::QASYMM8:
+ {
return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ }
case DataType::QASYMM8_SIGNED:
+ {
return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
@@ -1655,7 +1712,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
if(!is_arg_min_max)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
}
else
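In the vector path above, the same transform is applied with one fused multiply-accumulate per lane: wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value), vec_A) computes B + float(sum) * A across four lanes at once. A rough raw-NEON equivalent, shown only to make the wrapper call concrete (the function name is illustrative):

    #include <arm_neon.h>

    // One step of the fused mean requantization: B + float(sum) * A per lane,
    // mirroring wrapper::vmla(vec_B, wrapper::vcvt<float>(...), vec_A).
    float32x4_t requantize_mean_lanes(int32x4_t sum, float A, float B)
    {
        const float32x4_t vec_A = vdupq_n_f32(A);
        const float32x4_t vec_B = vdupq_n_f32(B);
        return vmlaq_f32(vec_B, vcvtq_f32_s32(sum), vec_A);
    }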
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index a2322c9eda..9c9b79a1e5 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,7 @@
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
@@ -85,14 +83,6 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
}
const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
- const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
- if(requant)
- {
- TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
- NEDequantizationLayer::validate(input, &input_no_quant);
- TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32));
- NEQuantizationLayer::validate(&output_no_quant, output);
- }
}
return Status{};
}
@@ -101,8 +91,7 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
NEReduceMean::~NEReduceMean() = default;
NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
- _output_no_quant()
+ : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
{
}
@@ -121,7 +110,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -129,18 +117,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
ITensor *tmp_input = input;
ITensor *tmp_output = output;
- if(_do_requant)
- {
- _memory_group.manage(&_input_no_quant);
- _memory_group.manage(&_output_no_quant);
- TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape);
- output_no_quant_info.set_data_type(DataType::F32);
- auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info);
- auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32));
- _dequant.configure(input, &_input_no_quant);
- tmp_input = &_input_no_quant;
- tmp_output = &_output_no_quant;
- }
Coordinates axis_local = reduction_axis;
const int input_dims = tmp_input->info()->num_dimensions();
@@ -160,7 +136,7 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
_reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
@@ -171,7 +147,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
{
_reduced_outs[i].allocator()->allocate();
}
-
// Configure reshape layer if we want to drop the dimensions
if(!keep_dims)
{
@@ -186,21 +161,11 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output);
}
- if(_do_requant)
- {
- _requant.configure(&_output_no_quant, output);
- _input_no_quant.allocator()->allocate();
- _output_no_quant.allocator()->allocate();
- }
}
void NEReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_do_requant)
- {
- _dequant.run();
- }
for(auto &kernel : _reduction_kernels)
{
kernel.run();
@@ -209,9 +174,5 @@ void NEReduceMean::run()
{
_reshape.run();
}
- if(_do_requant)
- {
- _requant.run();
- }
}
} // namespace arm_compute
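Dropping the mismatching-quantization-info check in validate_arguments means NEReduceMean now accepts an output whose quantization differs from the input's, with the requantization handled inside the reduction kernel rather than by a dequantize/requantize pair around it. A minimal usage sketch under that assumption (shapes and quantization parameters are made up for illustration):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // 8x8x16 QASYMM8 input, mean-reduced over axes 0 and 1 -> shape (16).
        Tensor input, output;
        input.allocator()->init(
            TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
        // The output quantization may differ from the input's; no explicit
        // dequantization/requantization stage is configured around the reduction.
        output.allocator()->init(
            TensorInfo(TensorShape(16U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));

        NEReduceMean reduce_mean;
        reduce_mean.configure(&input, Coordinates(0, 1), /*keep_dims=*/false, &output);

        input.allocator()->allocate();
        output.allocator()->allocate();
        // ... fill input ...
        reduce_mean.run();
        return 0;
    }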