diff options
author | SiCong Li <sicong.li@arm.com> | 2023-10-17 17:38:57 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2023-11-08 09:49:56 +0000 |
commit | c5ab4df0c11dc66db47f2070edc719923af3367e (patch) | |
tree | c04bdac32528e628b2a9b9a1c1653e300328fc1b /src/cpu/operators/CpuGemm.cpp | |
parent | 4a9dbedfbfa66c2612c7461e60cd867b8aea825b (diff) | |
download | ComputeLibrary-c5ab4df0c11dc66db47f2070edc719923af3367e.tar.gz |
Optimize CpuGemmConv2d start-up time
When weight has no holes, we can replace CpuWeightsReshapeKernel with:
- Collapse by reinterpreting weight's 3 spatial dimensions
- Perform CpuTranspose
For more details see the documentation in
src/cpu/operators/CpuGemmConv2d.cpp
This is one optimization, since CpuTranspose performs better than
CpuWeightsReshapeKernel
A second optimization is to fuse this transpose with other weight
transformations (e.g. pretranspose_B_array in CpuGemmAssemblyDispatch)
However, this second optimization depends on how the underlying gemm
methods (the fallback path: CpuGemmMatrixMultiplyKernel or the assembly
path: CpuGemmAssemblyDispatch) choose to fuse the transpose.
Therefore, this patch moves the transpose down from CpuGemmConv2d, to
the individual gemm operators where the fusion decision needs to be
made, by passing an extra "transpose_b" flag to CpuGemm
New transpose_b flag in different scopes (they are all the same, but
with different names because pretranspose_b has a different meaning in
GemmAssemblyDispatch):
GEMMInfo::pretranspose_B -> AsmGemmInfo::transpose_b
New auxiliary tensors holding the transposed b result:
- CpuGemm optimized path: CpuGemmAssemblyDispatch::PrePretransposedB
- CpuGemm fallback path: CpuGemm::PreTransposedRHS
Note that this patch does not yet have the second optimization
(COMPMID-6595), but it prepares for it.
Relates to COMPMID-6595
Resolves COMPMID-6499
Change-Id: I999a2da9da4b2b15369a3cc06d7872c86e0190ea
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10526
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/operators/CpuGemm.cpp')
-rw-r--r-- | src/cpu/operators/CpuGemm.cpp | 195 |
1 file changed, 142 insertions, 53 deletions
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp index 8da166dbef..e035de0131 100644 --- a/src/cpu/operators/CpuGemm.cpp +++ b/src/cpu/operators/CpuGemm.cpp @@ -53,6 +53,8 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) asm_info.fast_mode = info.fast_math(); asm_info.fixed_format = info.fixed_format(); asm_info.weight_format = info.weight_format(); + asm_info.transpose_b = + info.pretranspose_B(); // The "pretranspose_B" flag here is not the same as the pretranspose_B_array method. The flag here signals to pretranspose_B_array method if we want to perform additional transpose on B before the pretranspose_B_array method return asm_info; } @@ -72,7 +74,7 @@ void CpuGemm::configure(const ITensorInfo *a, const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); const bool is_c_bias = beta == 1 && c != nullptr; - bool run_optimised = + const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. !(!b->are_values_constant() && @@ -92,14 +94,17 @@ void CpuGemm::configure(const ITensorInfo *a, if (run_optimised) { + _run_interleave_transpose = false; const ITensorInfo *c_to_use = is_c_bias ? 
c : nullptr; _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); _asm_glue->configure(a, b, c_to_use, d, asm_info); ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); - auto asm_mem_req = _asm_glue->workspace(); - _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; - _aux_mem[Pretraspose] = asm_mem_req[Pretraspose]; + const auto asm_mem_req = _asm_glue->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } // Scale product by alpha if (_run_alpha_scale) @@ -111,37 +116,74 @@ void CpuGemm::configure(const ITensorInfo *a, } else { + _run_interleave_transpose = !_run_vector_matrix_multiplication; // Pick output tensor in case bias addition should be performed ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d; + // Pick b tensor in case pretranspose should be performed + const ITensorInfo *b_to_use = b; _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>(); + // Configure rhs pretranspose + if (gemm_info.pretranspose_B()) + { + _pretranspose_b_func = std::make_unique<CpuTranspose>(); + _pretranspose_b_func->configure(b_to_use, &_pretransposed_b); + MemoryLifetime lifetime; + if (_reshape_b_only_on_first_run) + { + if (_run_interleave_transpose) + { + // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS + // So PreTransposedRHS can be freed inside prepare() + lifetime = MemoryLifetime::Prepare; + } + else + { + // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs + // So PreTransposedRHS needs to persist beyond prepare() + lifetime = MemoryLifetime::Persistent; + } + } + else + { + // PreTransposedRHS tensor is always used in run() and doesn't need to persist + lifetime = MemoryLifetime::Temporary; + } + _aux_mem[PreTransposedRHS] = + MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size()); + b_to_use = &_pretransposed_b; + } + // 
Select between GEMV and GEMM if (_run_vector_matrix_multiplication) { // Configure the matrix multiply kernel - _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false); + _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false); } else { - const int m = a->dimension(1); - const int n = b->dimension(0); - const int k = a->dimension(0); - + ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose); // Configure interleave kernel _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>(); _interleave_kernel->configure(a, &_tmp_a); _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); - // Configure transpose kernel - _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); - _transpose_kernel->configure(b, &_tmp_b); - _aux_mem[TransposedRHS] = - MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + // Configure rhs transpose1xw kernel + _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); + _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b); + _aux_mem[Transposed1xWRHS] = + MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + + // Use a and b here instead of _tmp_a and _tmp_b because CpuGemmMatrixMultiplyKernel requires the original m,n,k in case of interleaved a and transposed1xw b + const int m = a->dimension(1); + const int n = b_to_use->dimension(0); + const int k = a->dimension(0); // Configure matrix multiplication kernel - _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); + _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose, + GEMMReshapeInfo(m, n, k)); } if (_run_bias_addition) @@ -179,6 +221,16 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_UNUSED(alpha); const bool is_c_bias = beta == 1 && c != nullptr; const bool 
run_addition = c != nullptr && beta != 0 && beta != 1; + // Check if we should use the pretransposed_b or original b + // TODO: COMPMID-6597 + // Note that this check should only apply to the non-optimized path. The reason we brought this at the beginning + // instead of only for the fallback path is because of the checks performed below, between here and the run_optimised decision + // We should simplify this by + // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch) + // 2. Moving this b_to_use check back into the non-optimized path + TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b)); + const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? &pretransposed_b : b; + // TODO: COMPMID-6597 fix-start ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); @@ -187,16 +239,16 @@ Status CpuGemm::validate(const ITensorInfo *a, if (is_fixed_format_fast_math(gemm_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use); } const int block_by = arm_compute::block_by(gemm_info.weight_format()); // test if im2col has changed the dimensions that are needed for padding - if (a->dimension(0) != b->dimension(1) && block_by > 1) + if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1) { // have to verify bias const size_t dim0_sz = a->dimension(0); @@ -204,18 +256,18 @@ Status CpuGemm::validate(const ITensorInfo *a, (dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + 
std::to_string(block_by)).c_str()); // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right - // b->dimension(1) = kernel_area * input_channel - // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right - const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by; - const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right; + // b_to_use->dimension(1) = kernel_area * input_channel + // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right + const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by; + const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right; ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (dim0_sz - kernel_area * input_pad_right) != b->dimension(1), + (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related"); } else { ARM_COMPUTE_RETURN_ERROR_ON_MSG( - a->dimension(0) != b->dimension(1), + a->dimension(0) != b_to_use->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); } @@ -233,14 +285,14 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d); ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); } if (d->total_size() != 0) { // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more. 
- ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0)); if (gemm_info.depth_output_gemm3d() != 0) { if (gemm_info.reinterpret_input_as_3d()) @@ -258,10 +310,14 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); } } + // TODO: COMPMID-6597 fix-end // Check if we need to run the optimized assembly kernel cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = + + // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_b() flag + // so we pass the original b to CpuGemmAssemblyDispatch + const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. !(!b->are_values_constant() && @@ -277,13 +333,13 @@ Status CpuGemm::validate(const ITensorInfo *a, // Check if the first input tensor is a vector. 
const bool run_vector_matrix_multiplication = a->dimension(1) < 2; // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = !run_vector_matrix_multiplication && !b->are_values_constant(); + const bool run_interleave_transpose = !run_vector_matrix_multiplication; // Arguments used by GEMMReshapeInfo // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo // in order to know how the matrices have been reshaped const int m = a->dimension(1); - const int n = b->dimension(0); + const int n = b_to_use->dimension(0); const int k = a->dimension(0); int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; @@ -292,7 +348,7 @@ Status CpuGemm::validate(const ITensorInfo *a, m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; + const ITensorInfo *matrix_b_info = b_to_use; TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; @@ -309,9 +365,10 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape( - *b, mult_transpose1xW_width))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); + auto_init_if_empty(tmp_b_info, + b_to_use->clone()->set_tensor_shape( + compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info)); } // Validate matrix multiply @@ -367,29 +424,46 @@ void CpuGemm::run(ITensorPack &tensors) else { CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, 
true); - CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true); + CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true); CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; - if (!_run_vector_matrix_multiplication) + + if (_run_interleave_transpose) { // Run interleave kernel ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack); + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); + } + const ITensor *b_to_use = b; + if (_pretranspose_b_func) + { if (!_reshape_b_only_on_first_run) { - // Run transpose kernel - ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), - transpose_pack); + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); } - - // Use reshaped matrices - mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); - mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get()); + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + if (!_reshape_b_only_on_first_run) + { + // Run transpose1xw kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } + b_to_use = transposed1xw_b.get(); } + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_1, b_to_use); 
NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, @@ -426,17 +500,32 @@ void CpuGemm::prepare(ITensorPack &tensors) { _asm_glue->prepare(tensors); } - else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) + else if (_reshape_b_only_on_first_run) { - const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); - ITensor *b_aux = - utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS))); - ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux); - - CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux); - ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), - transpose_pack); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *b_to_use = b; + CpuAuxTensorHandler pretransposed_b( + offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors, + false /*pack_inject: no need to inject into tensors*/, + _pretranspose_b_func == + nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, + false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/); + + if (_pretranspose_b_func) + { + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + // Run transpose kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } } _is_prepared = true; } |