From d39e2b1e0be12420b1e00279ebee0c34bae3dd8c Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 6 Aug 2018 14:31:15 +0100 Subject: COMPMID-1188 - Fixed performance degradation with GEMM3D The previous implementation of GEMM3D degradated the performance when the input had to be reinterpreted as 3D. However if both input and output have to be reinterpreted as 3D, we can skip the offset calculation for that specific case and run the multi GEMM approach Change-Id: I0d5d48add2c6ccdebfbb268ea199dd181101f3aa Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/142872 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- examples/graph_squeezenet_v1_1.cpp | 2 -- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 23 ++++++++++++++++++++-- tests/datasets/LargeGEMMDataset.h | 8 ++++---- tests/datasets/SmallGEMMDataset.h | 2 +- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/examples/graph_squeezenet_v1_1.cpp b/examples/graph_squeezenet_v1_1.cpp index bf6324698f..e52062927b 100644 --- a/examples/graph_squeezenet_v1_1.cpp +++ b/examples/graph_squeezenet_v1_1.cpp @@ -87,7 +87,6 @@ public: graph << common_params.target << common_params.fast_math_hint << InputLayer(input_descriptor, get_input_accessor(common_params, std::move(preprocessor))) - << ConvolutionMethod::Direct << ConvolutionLayer( 3U, 3U, 64U, get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/conv1_w.npy", weights_layout), @@ -95,7 +94,6 @@ public: PadStrideInfo(2, 2, 0, 0)) << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) - << ConvolutionMethod::Default << ConvolutionLayer( 1U, 1U, 16U, get_weights_accessor(data_path, "/cnn_data/squeezenet_v1_1_model/fire2_squeeze1x1_w.npy", weights_layout), diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index c9e6bb34b2..79e2f8b11a 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -57,6 +57,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D"); if(!is_interleaved_transposed) { @@ -109,13 +110,23 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu const DataType data_type = input0->data_type(); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); + bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 1); + + // In case both input and output have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(reinterpret_input_as_3d == reinterpret_output_as_3d) + { + reinterpret_input_as_3d = false; + reinterpret_output_as_3d = false; + } // Output tensor auto inizialitation if not yet initialized auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info))); TensorInfo tmp_info(*output); - if(reshape_info.depth_output_gemm3d() != 1) + if(reinterpret_output_as_3d) { // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -162,7 +173,7 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic - const int m = reshape_info.reinterpret_input_as_3d() ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1]; + const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1]; const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y; // Create kernels according to the architecture, data type and input size. @@ -219,6 +230,14 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 1); + // In case both input and output have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + // Check if we need to slide the matrix B const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions(); diff --git a/tests/datasets/LargeGEMMDataset.h b/tests/datasets/LargeGEMMDataset.h index 9c45cb0657..bbf362ca21 100644 --- a/tests/datasets/LargeGEMMDataset.h +++ b/tests/datasets/LargeGEMMDataset.h @@ -71,11 +71,11 @@ public: { add_config(TensorShape(923U, 143U, 3U), TensorShape(871U, 923U), TensorShape(871U, 143U, 3U), TensorShape(871U, 143U, 3U), 1.0f, 0.0f); add_config(TensorShape(681U, 205U, 5U), TensorShape(213U, 681U), TensorShape(213U, 205U, 5U), TensorShape(213U, 205U, 5U), 1.0f, 0.0f); - add_config(TensorShape(364U, 605U, 5U), TensorShape(96U, 364U), TensorShape(96U, 605U, 5U), TensorShape(96U, 605U, 5U), 1.0f, 0.0f); + add_config(TensorShape(364U, 605U, 5U), TensorShape(96U, 364U), TensorShape(96U, 605U, 5U), TensorShape(96U, 605U, 5U), 0.2f, 1.2f); add_config(TensorShape(1201U, 243U, 3U), TensorShape(128U, 1201U), TensorShape(128U, 243U, 3U), TensorShape(128U, 243U, 3U), 1.0f, 0.0f); - add_config(TensorShape(2305U, 13U, 13U), TensorShape(384U, 2305U), TensorShape(384U, 13U, 13U), TensorShape(384U, 13U, 13U), 1.0f, 0.0f); - add_config(TensorShape(1729U, 85U, 2U, 2U), TensorShape(192U, 1729U, 2U), TensorShape(192U, 85U, 2U, 2U), TensorShape(192U, 85U, 2U, 2U), 1.0f, 0.0f); - add_config(TensorShape(1729U, 17U, 10U, 3U), TensorShape(128U, 1729U, 3U), TensorShape(128U, 17U, 10U, 3U), TensorShape(128U, 17U, 10U, 3U), 1.0f, 0.0f); + add_config(TensorShape(2305U, 13U, 13U), TensorShape(384U, 2305U), TensorShape(384U, 13U, 13U), TensorShape(384U, 13U, 13U), 0.4f, 0.7f); + add_config(TensorShape(1729U, 85U, 2U, 2U), TensorShape(192U, 1729U), TensorShape(192U, 85U, 2U, 2U), TensorShape(192U, 85U, 2U, 2U), 1.0f, 0.0f); + add_config(TensorShape(1729U, 17U, 10U, 3U), TensorShape(128U, 1729U), TensorShape(128U, 17U, 10U, 3U), TensorShape(128U, 17U, 10U, 3U), 1.0f, 0.3f); } }; } // namespace datasets diff --git a/tests/datasets/SmallGEMMDataset.h b/tests/datasets/SmallGEMMDataset.h index e108fcc1ca..15a3504e85 100644 --- a/tests/datasets/SmallGEMMDataset.h +++ b/tests/datasets/SmallGEMMDataset.h @@ -73,7 +73,7 @@ public: add_config(TensorShape(38U, 12U, 2U), TensorShape(21U, 38U), TensorShape(21U, 12U, 2U), TensorShape(21U, 12U, 2U), 0.2f, 1.2f); add_config(TensorShape(32U, 1U, 4U, 3U), TensorShape(17U, 32U), TensorShape(17U, 1U, 4U, 3U), TensorShape(17U, 1U, 4U, 3U), 0.4f, 0.7f); add_config(TensorShape(16U, 16U, 3U, 2U), TensorShape(8U, 16U), TensorShape(8U, 16U, 3U, 2U), TensorShape(8U, 16U, 3U, 2U), 1.0f, 0.0f); - add_config(TensorShape(16U, 16U, 5U, 3U), TensorShape(8U, 16U, 3U), TensorShape(8U, 16U, 5U, 3U), TensorShape(8U, 16U, 5U, 3U), 1.0f, 0.0f); + add_config(TensorShape(16U, 16U, 5U, 3U), TensorShape(8U, 16U), TensorShape(8U, 16U, 5U, 3U), TensorShape(8U, 16U, 5U, 3U), 1.0f, 0.3f); } }; } // namespace datasets -- cgit v1.2.1