From c5ab4df0c11dc66db47f2070edc719923af3367e Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Tue, 17 Oct 2023 17:38:57 +0100 Subject: Optimize CpuGemmConv2d start-up time When weight has no holes, we can replace CpuWeightsReshapeKernel with: - Collapse by reinterpreting weight's 3 spatial dimensions - Perform CpuTranspose For more details see the documentation in src/cpu/operators/CpuGemmConv2d.cpp This is one optimization since the CpuTranspose is better performing than CpuWeightsReshapeKernel A second optimization is to fuse this transpose with other weight transformations (e.g. pretranspose_B_array in CpuGemmAssemblyDispatch) However this second optimization depends on how the underlying gemm methods (the fall back path: CpuGemmMatrixMultiplyKernel or the assembly path: CpuGemmAssemblyDispatch) choose to fuse the transpose. Therefore, this patch moves the transpose down from CpuGemmConv2d, to the individual gemm operators where the fusion decision needs to be made, by passing an extra "transpose_b" flag to CpuGemm New transpose_b flag in different scopes (they are all the same, but with different names because pretranspose_b has a different meaning in GemmAssemblyDispatch): GEMMInfo::pretranspose_B -> AsmGemmInfo::transpose_b New auxiliary tensors holding the transposed b result: - CpuGemm optimized path: CpuGemmAssemblyDispatch::PrePretransposedB - CpuGemm fallback path: CpuGemm::PreTransposedRHS Note that this patch does not yet have the second optimization (COMPMID-6595), but it prepares for it. 
Relates to COMPMID-6595 Resolves COMPMID-6499 Change-Id: I999a2da9da4b2b15369a3cc06d7872c86e0190ea Signed-off-by: SiCong Li Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10526 Tested-by: Arm Jenkins Reviewed-by: Anitha Raj Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/core/helpers/Utils.cpp | 5 +++++ src/core/helpers/Utils.h | 12 ++++++++++++ 2 files changed, 17 insertions(+) (limited to 'src/core') diff --git a/src/core/helpers/Utils.cpp b/src/core/helpers/Utils.cpp index 6ca29d180d..f8895d8a3c 100644 --- a/src/core/helpers/Utils.cpp +++ b/src/core/helpers/Utils.cpp @@ -25,6 +25,11 @@ namespace arm_compute { +bool has_holes(const ITensorInfo &info) +{ + return has_holes(info, info.num_dimensions() - 1); +} + bool has_holes(const ITensorInfo &info, size_t dimension) { const auto &shape = info.tensor_shape(); diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h index 2e7224c55b..a17a78f7ee 100644 --- a/src/core/helpers/Utils.h +++ b/src/core/helpers/Utils.h @@ -93,6 +93,18 @@ inline unsigned int get_next_power_two(unsigned int x) return x; } +/** Check if the tensor has any holes. + * + * A hole is defined as any gap in the tensor between two consecutive values. This can be a result of extending + * the paddings or manipulating the strides of the tensor + * + * @param[in] info Tensor info object defining the shape of the input tensor. + * + * @note This function checks for holes in all dimensions. + * + */ +bool has_holes(const ITensorInfo &info); + /** Check if the tensor has any holes. * * @param[in] info Tensor info object defining the shape of the input tensor. -- cgit v1.2.1