From 51a9c3cccd5aded7e6e29187fdcba8760722539f Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Thu, 22 Aug 2019 16:20:04 +0100 Subject: COMPMID-2336: Validate multiplier and offset calculation in LSTMQuantized and DepthwiseConvolution functions This patch also adds validation of internal functions in LSTMQuantizedLayer. Change-Id: Id8dbbfbb421f7d053410476b4bb4ef7d85e5f41e Signed-off-by: Michele Di Giorgio Signed-off-by: giuros01 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/1794 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp | 11 ++ .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 11 ++ .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 30 +++- src/runtime/CL/functions/CLLSTMLayerQuantized.cpp | 177 +++++++++++++++++++-- .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 16 +- .../NEON/functions/NELSTMLayerQuantized.cpp | 177 +++++++++++++++++++-- tests/validation/CL/DepthwiseConvolutionLayer.cpp | 4 +- 7 files changed, 400 insertions(+), 26 deletions(-) diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp index c91dcece80..42e5fbc8f2 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp @@ -80,6 +80,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } + if(is_qasymm) + { + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + } + return Status{}; } diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index c78ad1a5b5..b8b144dbfa 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -96,6 +96,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); } + if(is_qasymm) + { + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info; + + float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + } + return Status{}; } diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 451ccc4956..f01b58a8b3 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -142,9 +142,10 @@ Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); + const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const bool is_nhwc = input->data_layout() == DataLayout::NHWC; const bool needs_permute = is_nhwc && (depth_multiplier > 1); - const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1); + const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && is_quantized; const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1)); const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1); const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); @@ -152,6 +153,17 @@ Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const info.c0 = 4; info.transpose = is_stride_1_dilation_1 && is_dot8_supported; + if(is_quantized) + { + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform(); + + const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + } + if(needs_permute) { TensorShape permuted_input_shape = input->tensor_shape(); @@ -177,7 +189,10 @@ Status CLDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, &weights->clone()->set_tensor_shape(reshaped_weights_shape), biases, output, conv_info, depth_multiplier, act_info, dilation)); } - ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); + else + { + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation)); + } } else { @@ -373,7 +388,7 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe const bool can_run_optimised_3x3_kernel = (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3); - if(can_run_optimised_3x3_kernel) + if(!can_run_optimised_3x3_kernel) { const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); @@ -415,6 +430,13 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe if(is_quantized) { + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = (output->total_size() == 0) ? iq_info : output->quantization_info().uniform(); + + const float multiplier = iq_info.scale * wq_info.scale / oq_info.scale; + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output)); } @@ -426,7 +448,7 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe } else { - CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation); + ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayer3x3::validate(input, weights, biases, output, conv_info, depth_multiplier, act_info, GPUTarget::MIDGARD, dilation)); } return Status{}; } diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp index e0006a77d0..11cf85e96e 100644 --- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp +++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp @@ -168,15 +168,30 @@ void CLLSTMLayerQuantized::configure(const ICLTensor *input, _bias.allocator()->allocate(); // Get the gate tensors - _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); - _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); - _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); - _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); - _output_lowp.allocator()->allocate(); + if(batch_size > 1) + { + _memory_group.manage(&_input_gate_input); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _memory_group.manage(&_forget_gate_input); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _memory_group.manage(&_input_modulation_gate_input); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _memory_group.manage(&_output_gate_input); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _output_lowp.allocator()->allocate(); + } + else + { + _memory_group.manage(&_input_gate_input); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _memory_group.manage(&_forget_gate_input); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _memory_group.manage(&_input_modulation_gate_input); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _memory_group.manage(&_output_gate_input); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _output_lowp.allocator()->allocate(); + } // Forget gate _memory_group.manage(&_forget_gate_output); @@ -286,6 +301,150 @@ Status CLLSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); + // Validate internal functions + // _concat_input_weights + std::vector inputs_weights_vector; + inputs_weights_vector.emplace_back(input_to_input_weights); + inputs_weights_vector.emplace_back(input_to_forget_weights); + inputs_weights_vector.emplace_back(input_to_cell_weights); + inputs_weights_vector.emplace_back(input_to_output_weights); + const QuantizationInfo qweights = input_to_input_weights->quantization_info(); // Weights quantization + const TensorInfo input_weights(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_weights_vector, &input_weights, Window::DimY)); + + // _concat_recurrent_weights + std::vector recurrent_weights_vector; + recurrent_weights_vector.emplace_back(recurrent_to_input_weights); + recurrent_weights_vector.emplace_back(recurrent_to_forget_weights); + recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); + recurrent_weights_vector.emplace_back(recurrent_to_output_weights); + const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + + // _concat_weights + std::vector weights_vector; + weights_vector.emplace_back(&recurrent_weights); + weights_vector.emplace_back(&input_weights); + const TensorInfo weights(TensorShape(input_size + output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); + // _transpose_weights + const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed)); + + // _concat_inputs + std::vector input_vector; + input_vector.emplace_back(input); + input_vector.emplace_back(output_state_in); + TensorInfo input_concatenated(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(input_vector, &input_concatenated, Window::DimX)); + + // _concat_bias + std::vector bias_vector; + bias_vector.emplace_back(input_gate_bias); + bias_vector.emplace_back(forget_gate_bias); + bias_vector.emplace_back(cell_bias); + bias_vector.emplace_back(output_gate_bias); + + const TensorInfo bias_concatenated(TensorShape(4 * output_size), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(bias_vector, &bias_concatenated, Window::DimX)); + + // Invert the offset for gemmlowp + input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); + weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + + // _gemmlowp + const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + + // Set the offset back + input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); + weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + + // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) + const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3); + + const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + // _output_stage + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + + TensorInfo input_gate_input; + TensorInfo forget_gate_input; + TensorInfo input_modulation_gate_input; + TensorInfo output_gate_input; + + if(batch_size > 1) + { + // _slice_input_tensor + input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + // _slice_forget_tensor + forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + // _slice_cell_tensor + input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + // _slice_output_tensor + output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + } + else + { + // _slice_input_tensor + input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + // _slice_forget_tensor + forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + // _slice_cell_tensor + input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + // _slice_output_tensor + output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + } + + // _sigmoid_forget_gate + const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + // _sigmoid_input_gate + const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + // _tanh_modulation_gate + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + // _sigmoid_output_gate + const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + + // _mul_forget_gate_cell_state + const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + // _mul_input_gate_input_mod_gate + const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + // _add_cell_state_tmps + ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + + // _tanh_modulation_gate + const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + + // _mul_output_state_tmp_output_gate + const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + // _dequantize + const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&output_state_out_symm, &output_state_out_f32)); + + // _quantize + ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out)); + if(cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index cdd278b2f1..fbdee84474 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -583,10 +583,22 @@ Status NEDepthwiseConvolutionLayerOptimized::validate(const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } + const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); + + if(is_quantized) + { + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); + + float multiplier = (iq_info.scale * wq_info.scale) / oq_info.scale; + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + } + if(!NEDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(input, weights, conv_info, depth_multiplier, dilation)) { - const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); + TensorInfo accumulator = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier, dilation)); if(is_quantized) diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index 6cfa9887ff..264cca0bf2 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -147,15 +147,30 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _bias.allocator()->allocate(); // Get the gate tensors - _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); - _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); - _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); - _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); - _output_lowp.allocator()->allocate(); + if(batch_size > 1) + { + _memory_group.manage(&_input_gate_input); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _memory_group.manage(&_forget_gate_input); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _memory_group.manage(&_input_modulation_gate_input); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _memory_group.manage(&_output_gate_input); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _output_lowp.allocator()->allocate(); + } + else + { + _memory_group.manage(&_input_gate_input); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _memory_group.manage(&_forget_gate_input); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _memory_group.manage(&_input_modulation_gate_input); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _memory_group.manage(&_output_gate_input); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _output_lowp.allocator()->allocate(); + } // Forget gate _memory_group.manage(&_forget_gate_output); @@ -265,6 +280,150 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); + // Validate internal functions + // _concat_input_weights + std::vector inputs_weights_vector; + inputs_weights_vector.emplace_back(input_to_input_weights); + inputs_weights_vector.emplace_back(input_to_forget_weights); + inputs_weights_vector.emplace_back(input_to_cell_weights); + inputs_weights_vector.emplace_back(input_to_output_weights); + const QuantizationInfo qweights = input_to_input_weights->quantization_info(); // Weights quantization + const TensorInfo input_weights(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_weights_vector, &input_weights, Window::DimY)); + + // _concat_recurrent_weights + std::vector recurrent_weights_vector; + recurrent_weights_vector.emplace_back(recurrent_to_input_weights); + recurrent_weights_vector.emplace_back(recurrent_to_forget_weights); + recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); + recurrent_weights_vector.emplace_back(recurrent_to_output_weights); + const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + + // _concat_weights + std::vector weights_vector; + weights_vector.emplace_back(&recurrent_weights); + weights_vector.emplace_back(&input_weights); + const TensorInfo weights(TensorShape(input_size + output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); + // _transpose_weights + const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); + + // _concat_inputs + std::vector input_vector; + input_vector.emplace_back(input); + input_vector.emplace_back(output_state_in); + TensorInfo input_concatenated(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(input_vector, &input_concatenated, Window::DimX)); + + // _concat_bias + std::vector bias_vector; + bias_vector.emplace_back(input_gate_bias); + bias_vector.emplace_back(forget_gate_bias); + bias_vector.emplace_back(cell_bias); + bias_vector.emplace_back(output_gate_bias); + + const TensorInfo bias_concatenated(TensorShape(4 * output_size), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(bias_vector, &bias_concatenated, Window::DimX)); + + // Invert the offset for gemmlowp + input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); + weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + + // _gemmlowp + const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + + // Set the offset back + input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); + weights_transposed.set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + + // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) + const TensorInfo output_lowp(output_highp.tensor_shape(), 1, DataType::QSYMM16, qsymm_3); + + const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; + ARM_COMPUTE_UNUSED(multiplier); + ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f); + // _output_stage + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp)); + + TensorInfo input_gate_input; + TensorInfo forget_gate_input; + TensorInfo input_modulation_gate_input; + TensorInfo output_gate_input; + + if(batch_size > 1) + { + // _slice_input_tensor + input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + // _slice_forget_tensor + forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + // _slice_cell_tensor + input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + // _slice_output_tensor + output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + } + else + { + // _slice_input_tensor + input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + // _slice_forget_tensor + forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + // _slice_cell_tensor + input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + // _slice_output_tensor + output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + } + + // _sigmoid_forget_gate + const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + // _sigmoid_input_gate + const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + // _tanh_modulation_gate + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + // _sigmoid_output_gate + const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + + // _mul_forget_gate_cell_state + const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + // _mul_input_gate_input_mod_gate + const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + // _add_cell_state_tmps + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + + // _tanh_modulation_gate + const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + + // _mul_output_state_tmp_output_gate + const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + // _dequantize + const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); + ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&output_state_out_symm, &output_state_out_f32)); + + // _quantize + ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); + if(cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp index 4db363fdbc..1c3dc308c7 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp @@ -48,7 +48,7 @@ constexpr RelativeTolerance tolerance_f32(0.01f); /**< constexpr AbsoluteTolerance tolerance_qasymm8(0); /**< Tolerance value for comparing reference's output against implementation's output for DataType::QASYMM8 */ constexpr float tolerance_num = 0.05f; /**< Tolerance number */ -const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5 }); +const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5 }); const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 }); //Activation Functions @@ -273,7 +273,7 @@ DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true, true })), input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, dilation, expected) { - bool is_valid = bool(CLDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier,ActivationLayerInfo(), dilation)); + bool is_valid = bool(CLDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(true), &weights_info.clone()->set_is_resizable(true), &biases_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true), conv_info, depth_multiplier,ActivationLayerInfo(), dilation)); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } // clang-format on -- cgit v1.2.1