From 951b8a4c01de2810349b6f16cf9bbba7578484fa Mon Sep 17 00:00:00 2001
From: Vidhya Sudhan Loganathan
Date: Mon, 4 Nov 2019 14:42:08 +0000
Subject: COMPMID-2309 : CLConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters

Change-Id: I16f6758b768ede404a064db057302ded706e1e8a
Signed-off-by: Vidhya Sudhan Loganathan
Signed-off-by: Michele Di Giorgio
Reviewed-on: https://review.mlplatform.org/c/2215
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
Reviewed-by: Gian Marco Iodice
Comments-Addressed: Arm Jenkins
---
 .../CL/functions/CLDepthwiseConvolutionLayer.cpp   |  16 +--
 src/runtime/CL/functions/CLFullyConnectedLayer.cpp |   2 +
 .../CL/functions/CLGEMMConvolutionLayer.cpp        | 112 ++++++++++++---------
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp  | 106 +++++++++++++++----
 4 files changed, 161 insertions(+), 75 deletions(-)

(limited to 'src/runtime/CL/functions')

diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index cdf3a95568..e717f793fd 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -337,9 +337,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
     {
         _output_multipliers.map();
         _output_shifts.map();
-        quantization::compute_quantized_multipliers_and_shifts(_input,
-                                                               _original_weights,
-                                                               _output,
+        const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        quantization::compute_quantized_multipliers_and_shifts(_input->info(),
+                                                               _original_weights->info(),
+                                                               _output->info(),
+                                                               idx_ofms,
                                                                reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
                                                                reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
         _output_multipliers.unmap();
@@ -533,9 +535,11 @@ void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepar
     {
         _output_multipliers.map();
         _output_shifts.map();
-        quantization::compute_quantized_multipliers_and_shifts(_input,
-                                                               _original_weights,
-                                                               _output,
+        const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        quantization::compute_quantized_multipliers_and_shifts(_input->info(),
+                                                               _original_weights->info(),
+                                                               _output->info(),
+                                                               idx_ofms,
                                                                reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
                                                                reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
         _output_multipliers.unmap();
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 5bcf38d1c4..a8167ce8f7 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -68,6 +68,8 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorIn
         gemmlowp_output_stage.gemmlowp_shift     = output_shift;
         gemmlowp_output_stage.gemmlowp_min_bound = 0;
         gemmlowp_output_stage.gemmlowp_max_bound = 255;
+        gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
+        gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
     }
 
     return Status{};
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 831f108b85..d322723150 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -66,13 +66,14 @@ void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const
 
 Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
         const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
-        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(weights->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
@@ -81,7 +82,6 @@ Status CLConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, co
     if((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-
         CLWeightsReshapeKernel::validate(weights, biases, output, num_groups);
     }
@@ -201,9 +201,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     const unsigned int kernel_width  = weights->info()->dimension(idx_width);
     const unsigned int kernel_height = weights->info()->dimension(idx_height);
+    const unsigned int num_kernels   = weights->info()->dimension(idx_kernels);
 
     const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo wq_info = weights->info()->quantization_info().uniform();
     const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
 
     _is_prepared = weights_info.retain_internal_weights();
@@ -237,7 +237,7 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
                                                  conv_info,
                                                  dilation);
 
-    unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels) / num_groups;
+    unsigned int mat_weights_cols = num_kernels / num_groups;
 
     const ICLTensor *biases_to_use = biases;
     bool             append_bias   = false;
@@ -310,20 +310,28 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     }
 
     GEMMLowpOutputStageInfo gemmlowp_output_stage;
-    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-    gemmlowp_output_stage.gemmlowp_offset     = 0;
-    gemmlowp_output_stage.gemmlowp_multiplier = 0;
-    gemmlowp_output_stage.gemmlowp_shift      = 0;
+    gemmlowp_output_stage.type            = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset = 0;
 
     // Configure output stage for quantized case
     if(_is_quantized)
     {
-        const auto output_quant_info = (output->info()->total_size() == 0) ? iq_info : oq_info;
-
-        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
-        int         output_multiplier = 0;
-        int         output_shift      = 0;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        const auto         output_quant_info        = (output->info()->total_size() == 0) ? iq_info : oq_info;
+        const bool         is_quantized_per_channel = is_data_type_quantized_per_channel(weights->info()->data_type());
+        const unsigned int num_filters              = (is_quantized_per_channel) ? num_kernels : 1;
+
+        gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+        gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+        gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+        quantization::compute_quantized_multipliers_and_shifts(input->info(),
+                                                               weights->info(),
+                                                               output->info(),
+                                                               idx_kernels,
+                                                               gemmlowp_output_stage.gemmlowp_multipliers.data(),
+                                                               gemmlowp_output_stage.gemmlowp_shifts.data());
+        gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+        gemmlowp_output_stage.gemmlowp_shift      = gemmlowp_output_stage.gemmlowp_shifts[0];
 
         int min_activation = 0;
         int max_activation = 0;
@@ -350,11 +358,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
         }
 
         // Set the GEMMLowp output stage info
-        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
-        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
-        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
-        gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
-        gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
+        gemmlowp_output_stage.gemmlowp_offset    = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
     }
 
     // Configure and tune GEMM
@@ -396,8 +402,17 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+
+    if(is_quantized_per_channel)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != DataType::QASYMM8, "Input data type not compatible with Weights");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
@@ -412,6 +427,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
     const unsigned int kernel_width  = weights->dimension(idx_width);
     const unsigned int kernel_height = weights->dimension(idx_height);
+    const unsigned int num_kernels   = weights->dimension(idx_kernels);
 
     TensorInfo im2col_reshaped_info{};
     TensorInfo info_gemm{};
     const ITensorInfo *gemm_input_to_use  = input;
     const ITensorInfo *gemm_output_to_use = output;
     const ITensorInfo *weights_to_use     = weights;
-
-    const bool is_quantized    = is_data_type_quantized_asymmetric(data_type);
-    const bool skip_im2col     = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    const bool skip_col2im     = data_layout == DataLayout::NHWC;
-    bool       fuse_activation = true;
-
-    const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
-    const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-    const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
+    const bool         is_quantized    = is_data_type_quantized_asymmetric(data_type);
+    const bool         skip_im2col     = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    const bool         skip_col2im     = data_layout == DataLayout::NHWC;
+    bool               fuse_activation = true;
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -463,7 +474,7 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
                                          conv_info,
                                          dilation);
 
-    unsigned int mat_weights_cols = weights->dimension(idx_kernels) / num_groups;
+    unsigned int mat_weights_cols = num_kernels / num_groups;
 
     const ITensorInfo *biases_to_use = biases;
     bool               append_bias   = false;
@@ -514,20 +525,27 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
     }
 
     GEMMLowpOutputStageInfo gemmlowp_output_stage;
-    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-    gemmlowp_output_stage.gemmlowp_offset     = 0;
-    gemmlowp_output_stage.gemmlowp_multiplier = 0;
-    gemmlowp_output_stage.gemmlowp_shift      = 0;
+    gemmlowp_output_stage.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset          = 0;
+    gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
 
     if(is_quantized)
     {
-        const auto output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
-
-        const float multiplier        = (iq_info.scale * wq_info.scale) / output_quant_info.scale;
-        int         output_multiplier = 0;
-        int         output_shift      = 0;
-
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+        const UniformQuantizationInfo iq_info           = input->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info           = output->quantization_info().uniform();
+        const auto                    output_quant_info = (output->total_size() == 0) ? iq_info : oq_info;
+        const unsigned int            num_filters       = (is_quantized_per_channel) ? num_kernels : 1;
+
+        gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+        gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+        quantization::compute_quantized_multipliers_and_shifts(input,
+                                                               weights,
+                                                               output,
+                                                               idx_kernels,
+                                                               gemmlowp_output_stage.gemmlowp_multipliers.data(),
+                                                               gemmlowp_output_stage.gemmlowp_shifts.data());
+        gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+        gemmlowp_output_stage.gemmlowp_shift      = gemmlowp_output_stage.gemmlowp_shifts[0];
 
         int min_activation = 0;
         int max_activation = 0;
@@ -554,11 +572,9 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
         }
 
         // Set the GEMMLowp output stage info
-        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
-        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
-        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
-        gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
-        gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
+        gemmlowp_output_stage.gemmlowp_offset    = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
     }
 
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 0286cb3d6d..4c0a521de8 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 namespace arm_compute
@@ -49,6 +50,7 @@ inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_tar
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
+      _weights_to_qasymm8(),
       _mm_midgard_kernel(),
      _mm_native_kernel(),
       _mm_reshaped_only_rhs_kernel(),
@@ -57,18 +59,24 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
       _mtx_b_reduction_kernel(),
       _offset_contribution_kernel(),
       _offset_contribution_output_stage_kernel(),
+      _qasymm8_weights(),
       _vector_sum_col(),
       _vector_sum_row(),
       _tmp_b(),
       _mm_result_s32(),
+      _gemm_output_stage_multipliers(),
+      _gemm_output_stage_shifts(),
+      _matrix_a(nullptr),
       _original_b(nullptr),
+      _output(nullptr),
       _a_offset(0),
       _b_offset(0),
       _is_gemm_reshaped(true),
       _is_midgard(false),
       _reshape_b_only_on_first_run(false),
-      _is_prepared(false)
+      _is_prepared(false),
+      _convert_to_qasymm8(false)
 {
 }
@@ -81,7 +89,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     _original_b                  = b;
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->info()->quantization_info().uniform().offset;
-    _b_offset                    = b->info()->quantization_info().uniform().offset;
+    _matrix_a                    = a;
+    _output                      = output;
+
+    _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
+                          && is_data_type_quantized_asymmetric(a->info()->data_type());
+    _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;
 
     // Get the GPU target
     const GPUTarget gpu_target = CLScheduler::get().target();
@@ -91,8 +104,6 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     _mm_native_kernel.set_target(gpu_target);
     _mm_reshaped_only_rhs_kernel.set_target(gpu_target);
 
-    const ICLTensor *matrix_a = a;
-    const ICLTensor *matrix_b = b;
 
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
@@ -110,6 +121,16 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
     _is_midgard       = gpu_target == GPUTarget::MIDGARD;
 
+    if(_convert_to_qasymm8)
+    {
+        // Set data type for converted weights
+        TensorInfo weights_info(*b->info());
+        weights_info.set_data_type(DataType::QASYMM8);
+        _qasymm8_weights.allocator()->init(weights_info);
+        _weights_to_qasymm8.configure(b, &_qasymm8_weights, ConvertPolicy::WRAP, 0);
+    }
+
+    const ICLTensor *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
     if(_is_gemm_reshaped)
     {
         matrix_b = &_tmp_b;
@@ -123,7 +144,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
         // Configure reshape RHS kernel
-        _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
+        _mtx_b_reshape_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
     }
 
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
@@ -137,7 +158,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
     }
 
     // Configure Matrix B reduction kernel
-    _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
+    _mtx_b_reduction_kernel.configure(_convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col);
 }
 
 // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -161,14 +182,14 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
        {
             if(_is_midgard)
             {
                 // Configure matrix multiply kernel
-                _mm_midgard_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
             else
             {
@@ -176,13 +197,27 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
                 std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
                 // Configure matrix multiply kernel
-                _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
         }
 
-        // Configure offset contribution kernel
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+        _gemm_output_stage_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+        _gemm_output_stage_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
         _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
-                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage(), &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+
+        _gemm_output_stage_multipliers.allocator()->allocate();
+        _gemm_output_stage_shifts.allocator()->allocate();
+        // Compute GEMM output multipliers and shifts for output stage
+        _gemm_output_stage_multipliers.map();
+        _gemm_output_stage_shifts.map();
+        std::memcpy(_gemm_output_stage_multipliers.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
+        std::memcpy(_gemm_output_stage_shifts.ptr_to_element(Coordinates(0)), gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+        _gemm_output_stage_multipliers.unmap();
+        _gemm_output_stage_shifts.unmap();
 
         _mm_result_s32.allocator()->allocate();
     }
@@ -191,14 +226,14 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         if(_is_gemm_reshaped)
         {
             // Configure and tune matrix multiply kernel
-            _mm_reshaped_only_rhs_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            _mm_reshaped_only_rhs_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
         }
         else
         {
             if(_is_midgard)
             {
                 // Configure matrix multiply kernel
-                _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
             else
             {
@@ -206,7 +241,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
                 std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
                 // Configure matrix multiply kernel
-                _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+                _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
             }
         }
 
@@ -237,7 +272,15 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
 
 Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    if(b->data_type() == DataType::QSYMM8_PER_CHANNEL)
+    {
+        //DataType::QSYMM8_PER_CHANNEL supported only for weights
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() != DataType::QASYMM8, "Matrix A is not quantized while Matrix B is");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
@@ -245,7 +288,6 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
     int32_t b_offset = b->quantization_info().uniform().offset;
 
     const ITensorInfo *matrix_a_info = a;
-    const ITensorInfo *matrix_b_info = b;
 
     TensorInfo tmp_b_info{};
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
@@ -266,6 +308,16 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
 
     const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
+    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
+                              && is_data_type_quantized_asymmetric(a->data_type());
+    TensorInfo weights_info(*b);
+    if(convert_to_qasymm8)
+    {
+        b_offset = -128;
+        weights_info.set_data_type(DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthConvertLayerKernel::validate(b, &weights_info, ConvertPolicy::WRAP, 0));
+    }
+    const ITensorInfo *matrix_b_info = &weights_info;
     if(reshape_matrix_b)
     {
         matrix_b_info = &tmp_b_info;
@@ -274,8 +326,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         std::tie(lhs_info, rhs_info) = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
         // Validate reshape RHS kernel
-        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
     }
 
     TensorInfo info_vector_sum_col{};
@@ -284,10 +336,10 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
     if(a_offset != 0)
     {
-        info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
 
         // Configure Matrix B reduction kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col));
     }
 
     // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
@@ -332,13 +384,19 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         }
 
         // Validate offset contribution kernel
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+
+        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                             c,
                                                                                             output,
                                                                                             a_offset, b_offset,
-                                                                                            gemm_info.gemmlowp_output_stage()));
+                                                                                            gemm_info.gemmlowp_output_stage(),
+                                                                                            &gemm_output_stage_multipliers_shifts_info,
+                                                                                            &gemm_output_stage_multipliers_shifts_info));
     }
     else
     {
@@ -438,6 +496,12 @@ void CLGEMMLowpMatrixMultiplyCore::prepare()
 {
     if(!_is_prepared)
     {
+        if(_convert_to_qasymm8)
+        {
+            _qasymm8_weights.allocator()->allocate();
+            CLScheduler::get().enqueue(_weights_to_qasymm8, false);
+        }
+
         if(_is_gemm_reshaped && _reshape_b_only_on_first_run)
         {
             ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-- 
cgit v1.2.1
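
The per-filter multipliers and shifts this patch threads through GEMMLowpOutputStageInfo are the fixed-point decomposition of the float rescale factor (input_scale * weight_scale[c]) / output_scale, computed once per output channel for QSYMM8_PER_CHANNEL weights and once overall for uniform QASYMM8. The snippet below is a minimal standalone sketch of that decomposition, not the library's quantization::compute_quantized_multipliers_and_shifts routine itself; the scale values in main() are made-up examples.

// Sketch: decompose a real rescale factor into a Q0.31 fixed-point
// multiplier and a right shift, one pair per output channel.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

struct MultiplierShift
{
    int32_t multiplier; // Q0.31 fixed-point multiplier in [2^30, 2^31)
    int32_t shift;      // right shift (negative means shift left)
};

// real_multiplier == multiplier * 2^-31 * 2^-shift
MultiplierShift decompose(double real_multiplier)
{
    int    exponent = 0;
    double mantissa = std::frexp(real_multiplier, &exponent); // mantissa in [0.5, 1)

    auto fixed_point = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
    if(fixed_point == (1ll << 31)) // mantissa rounded up to 1.0
    {
        fixed_point /= 2;
        ++exponent;
    }
    return { static_cast<int32_t>(fixed_point), -exponent };
}

int main()
{
    const double input_scale  = 0.5;  // uniform QASYMM8 input scale (example value)
    const double output_scale = 0.25; // uniform QASYMM8 output scale (example value)
    // One weight scale per output channel, as with QSYMM8_PER_CHANNEL weights
    const std::vector<double> weight_scales = { 0.1, 0.02, 0.3 };

    for(double ws : weight_scales)
    {
        const MultiplierShift ms = decompose(input_scale * ws / output_scale);
        std::cout << "multiplier=" << ms.multiplier << " shift=" << ms.shift << '\n';
    }
    return 0;
}

These are the values the GEMMLowp output stage consumes: the offset-contribution/output-stage kernel rescales each int32 accumulator with its channel's pair, which is why the patch sizes gemmlowp_multipliers and gemmlowp_shifts to num_kernels in the per-channel case while keeping the scalar gemmlowp_multiplier/gemmlowp_shift fields set to element 0 for the uniform path.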