aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2019-10-29 10:58:13 +0000
committerMichele Di Giorgio <michele.digiorgio@arm.com>2019-12-20 14:05:24 +0000
commitf29d1b7d8bf2d1619554eb3443556b44d4aa1a4c (patch)
tree0a427f7fda2131f39e055f27b97f0a612aff990c /src
parent748a7c81245ae81d04607b3a762cf65cd39026f2 (diff)
downloadComputeLibrary-f29d1b7d8bf2d1619554eb3443556b44d4aa1a4c.tar.gz
COMPMID-2608: Enable quantization with multiplier greater than 1 on NEON
Change-Id: Ib2b0c9ac88fc2b645f478c9981f71ee28f2c77fd Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/2425 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp33
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp7
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp8
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp7
-rw-r--r--src/core/utils/quantization/AsymmHelpers.cpp10
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp12
-rw-r--r--src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp5
-rw-r--r--src/runtime/NEON/functions/NEFullyConnectedLayer.cpp23
-rw-r--r--src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp4
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp13
-rw-r--r--src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp10
11 files changed, 81 insertions, 51 deletions
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index a9a3183c5d..aee13ee578 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -289,7 +289,16 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
acc.at(i) += *reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t));
}
- acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), output_multiplier.at(id.x() + i)), output_shift.at(id.x() + i)) + output_qoffset;
+ const int out_mul = output_multiplier.at(id.x() + i);
+ const int out_shift = output_shift.at(id.x() + i);
+ if(out_shift < 0)
+ {
+ acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+ }
+ else
+ {
+ acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
+ }
out_vals[i] = static_cast<T>(utility::clamp<int32_t, uint8_t>(acc.at(i)));
}
@@ -381,21 +390,20 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
if(has_biases)
{
- const auto biases_val = *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+ }
- int32_t out_val = acc.at(m) + biases_val;
- out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(out_val, output_multiplier.at(id.x() + m)),
- output_shift.at(id.x() + m))
- + output_qoffset;
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, uint8_t>(out_val));
+ const int out_mul = output_multiplier.at(id.x() + m);
+ const int out_shift = output_shift.at(id.x() + m);
+ if(out_shift < 0)
+ {
+ acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
}
else
{
- int32_t out_val = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), output_multiplier.at(id.x() + m)),
- output_shift.at(id.x() + m))
- + output_qoffset;
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, uint8_t>(out_val));
+ acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
}
+ *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, uint8_t>(acc.at(m)));
}
},
input_it, weights_it, biases_it, output_it);
@@ -531,8 +539,7 @@ void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, co
int32_t out_mult = 0;
int32_t out_shift = 0;
const float multiplier = input_scale * weights_scale.at(i) / output_scale;
- ARM_COMPUTE_ERROR_ON(multiplier > 1.f);
- arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &out_mult, &out_shift);
+ arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
_output_multiplier.push_back(out_mult);
_output_shift.push_back(out_shift);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 4313a5e312..8834d9747a 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -38,14 +38,15 @@
#include <cstddef>
#include <cstdint>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
ARM_COMPUTE_UNUSED(result_offset_after_shift);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
@@ -53,7 +54,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
DataType::F16,
DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(result_shift < 0, "Result shift must be a non negative integer");
if(bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::S32, DataType::F32);
@@ -596,3 +596,4 @@ void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const
(*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 84187332f8..86abb2d65c 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -909,8 +909,12 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
&& mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
// Check if we need to clamp the result using min and max
- const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound)
- && !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255));
+ PixelValue type_min = 0;
+ PixelValue type_max = 0;
+ std::tie(type_min, type_max) = get_min_max(output->info()->data_type());
+ int type_min_int = type_min.get<int>();
+ int type_max_int = type_max.get<int>();
+ const bool is_bounded_relu = !(output_stage.gemmlowp_min_bound == type_min_int && output_stage.gemmlowp_max_bound == type_max_int);
// Check if we need to perform fixed point requantization
const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 4906e6a987..bb0b86404e 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -39,8 +39,8 @@
#include <cstddef>
#include <cstdint>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
@@ -244,4 +244,5 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
(this->*_func)(window);
-} \ No newline at end of file
+}
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 11241e83a0..5bda746e09 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -106,10 +106,10 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
return Status{};
}
-arm_compute::Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
- const QuantizationInfo &wq_info,
- const QuantizationInfo &oq_info,
- GEMMLowpOutputStageInfo &stage_info)
+arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
+ GEMMLowpOutputStageInfo &stage_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty());
@@ -131,7 +131,7 @@ arm_compute::Status calculate_quantized_multipliers_less_than_one(const Quantiza
const float multiplier = i_scale * w_scales[i] / o_scale;
int32_t quant_multiplier = 0;
int32_t quant_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &quant_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier(multiplier, &quant_multiplier, &quant_shift));
quant_multipliers[i] = quant_multiplier;
quant_shifts[i] = quant_shift;
}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index ca4fe732a7..ddcc71f466 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -60,16 +60,6 @@ Status validate_arguments_optimized(const ITensorInfo *input, const ITensorInfo
const bool is_quantized = (!is_data_type_quantized_per_channel(weights->data_type())) && is_data_type_quantized_asymmetric(input->data_type());
- if(is_quantized)
- {
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
-
- float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
- ARM_COMPUTE_UNUSED(multiplier);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
- }
if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation))
{
TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -205,7 +195,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
int32_t output_multiplier;
int32_t output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, oq_info.offset);
_accumulator.allocator()->allocate();
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 322bb2c425..65538848df 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -31,8 +31,8 @@
#include <cmath>
#include <tuple>
-using namespace arm_compute;
-
+namespace arm_compute
+{
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
_is_activationlayer_enabled(false), _dim_split(Window::DimZ)
@@ -118,3 +118,4 @@ void NEDirectConvolutionLayer::run()
_activationlayer_function.run();
}
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index b3b90f8599..01746eb3db 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -33,7 +33,8 @@
#include <algorithm>
#include <cmath>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
namespace
@@ -258,7 +259,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
int32_t output_multiplier;
int32_t output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
_gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, oq_info.offset);
_gemmlowp_output.allocator()->allocate();
}
@@ -352,13 +353,14 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
// Validate output stage for asymmetric quantized types
if(is_quantized)
{
- const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
- const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
-
- ARM_COMPUTE_UNUSED(multiplier);
- ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f);
+ const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+
+ float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale;
+ int output_multiplier;
+ int output_shift;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
}
@@ -475,4 +477,5 @@ void NEFullyConnectedLayer::prepare()
_is_prepared = true;
}
-} \ No newline at end of file
+}
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 0507c6b2bd..920917a58b 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -154,7 +154,7 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
output_info.gemmlowp_min_bound = min_activation;
output_info.gemmlowp_max_bound = max_activation;
output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
@@ -217,7 +217,7 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
output_info.gemmlowp_min_bound = min_activation;
output_info.gemmlowp_max_bound = max_activation;
output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
// Perform validation step on GEMMLowp
std::unique_ptr<ITensorInfo> input_qa = input->clone();
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index e36cb3d399..440f043527 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -344,6 +344,19 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
{
run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
run_optimised_requantized = run_optimised;
+
+ const UniformQuantizationInfo a_qinfo = a_to_use->quantization_info().uniform();
+ const QuantizationInfo b_qinfo = b->quantization_info();
+ const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
+ for(auto const s : b_qinfo.scale())
+ {
+ const float fmultipler = a_qinfo.scale * s / output_qinfo.scale;
+ if(fmultipler > 1.f)
+ {
+ run_optimised_requantized = false;
+ break;
+ }
+ }
}
else
{
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index 3235eee19a..142f873ef4 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -437,6 +437,16 @@ Status NEDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
+ // The uniform quantization case will only have 1 scale value in the weights quantization info
+ const UniformQuantizationInfo input_qinfo = input->quantization_info().uniform();
+ const QuantizationInfo weights_qinfo = weights->quantization_info();
+ const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
+ for(auto const s : weights_qinfo.scale())
+ {
+ const float fmultipler = input_qinfo.scale * s / output_qinfo.scale;
+ ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
+ }
+
return Status{};
}