diff options
author | SiCong Li <sicong.li@arm.com> | 2023-10-17 17:38:57 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2023-11-08 09:49:56 +0000 |
commit | c5ab4df0c11dc66db47f2070edc719923af3367e (patch) | |
tree | c04bdac32528e628b2a9b9a1c1653e300328fc1b /src/cpu/operators/CpuGemm.cpp | |
parent | 4a9dbedfbfa66c2612c7461e60cd867b8aea825b (diff) | |
download | ComputeLibrary-c5ab4df0c11dc66db47f2070edc719923af3367e.tar.gz |
Optimize CpuGemmConv2d start-up time
When weight has no holes, we can replace CpuWeightsReshapeKernel with:
- Collapse by reinterpreting weight's 3 spatial dimensions
- Perform CpuTranspose
For more details see the documentation in
src/cpu/operators/CpuGemmConv2d.cpp
This is one optimization, since CpuTranspose performs better than
CpuWeightsReshapeKernel
A second optimization is to fuse this transpose with other weight
transformations (e.g. pretranspose_B_array in CpuGemmAssemblyDispatch)
However, this second optimization depends on how the underlying gemm
methods (the fallback path: CpuGemmMatrixMultiplyKernel or the assembly
path: CpuGemmAssemblyDispatch) choose to fuse the transpose.
Therefore, this patch moves the transpose down from CpuGemmConv2d, to
the individual gemm operators where the fusion decision needs to be
made, by passing an extra "transpose_b" flag to CpuGemm
New transpose_b flag in different scopes (they are all the same, but
with different names because pretranspose_b has a different meaning in
GemmAssemblyDispatch):
GEMMInfo::pretranspose_B -> AsmGemmInfo::transpose_b
New auxiliary tensors holding the transposed b result:
- CpuGemm optimized path: CpuGemmAssemblyDispatch::PrePretransposedB
- CpuGemm fallback path: CpuGemm::PreTransposedRHS
Note that this patch does not yet have the second optimization
(COMPMID-6595), but it prepares for it.
Relates to COMPMID-6595
Resolves COMPMID-6499
Change-Id: I999a2da9da4b2b15369a3cc06d7872c86e0190ea
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10526
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/operators/CpuGemm.cpp')
-rw-r--r-- | src/cpu/operators/CpuGemm.cpp | 195 |
1 file changed, 142 insertions, 53 deletions
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp index 8da166dbef..e035de0131 100644 --- a/src/cpu/operators/CpuGemm.cpp +++ b/src/cpu/operators/CpuGemm.cpp @@ -53,6 +53,8 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) asm_info.fast_mode = info.fast_math(); asm_info.fixed_format = info.fixed_format(); asm_info.weight_format = info.weight_format(); + asm_info.transpose_b = + info.pretranspose_B(); // The "pretranspose_B" flag here is not the same as the pretranspose_B_array method. The flag here signals to pretranspose_B_array method if we want to perform additional transpose on B before the pretranspose_B_array method return asm_info; } @@ -72,7 +74,7 @@ void CpuGemm::configure(const ITensorInfo *a, const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); const bool is_c_bias = beta == 1 && c != nullptr; - bool run_optimised = + const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. !(!b->are_values_constant() && @@ -92,14 +94,17 @@ void CpuGemm::configure(const ITensorInfo *a, if (run_optimised) { + _run_interleave_transpose = false; const ITensorInfo *c_to_use = is_c_bias ? 
c : nullptr; _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); _asm_glue->configure(a, b, c_to_use, d, asm_info); ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); - auto asm_mem_req = _asm_glue->workspace(); - _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; - _aux_mem[Pretraspose] = asm_mem_req[Pretraspose]; + const auto asm_mem_req = _asm_glue->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } // Scale product by alpha if (_run_alpha_scale) @@ -111,37 +116,74 @@ void CpuGemm::configure(const ITensorInfo *a, } else { + _run_interleave_transpose = !_run_vector_matrix_multiplication; // Pick output tensor in case bias addition should be performed ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d; + // Pick b tensor in case pretranspose should be performed + const ITensorInfo *b_to_use = b; _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>(); + // Configure rhs pretranspose + if (gemm_info.pretranspose_B()) + { + _pretranspose_b_func = std::make_unique<CpuTranspose>(); + _pretranspose_b_func->configure(b_to_use, &_pretransposed_b); + MemoryLifetime lifetime; + if (_reshape_b_only_on_first_run) + { + if (_run_interleave_transpose) + { + // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS + // So PreTransposedRHS can be freed inside prepare() + lifetime = MemoryLifetime::Prepare; + } + else + { + // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs + // So PreTransposedRHS needs to persist beyond prepare() + lifetime = MemoryLifetime::Persistent; + } + } + else + { + // PreTransposedRHS tensor is always used in run() and doesn't need to persist + lifetime = MemoryLifetime::Temporary; + } + _aux_mem[PreTransposedRHS] = + MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size()); + b_to_use = &_pretransposed_b; + } + // 
Select between GEMV and GEMM if (_run_vector_matrix_multiplication) { // Configure the matrix multiply kernel - _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false); + _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false); } else { - const int m = a->dimension(1); - const int n = b->dimension(0); - const int k = a->dimension(0); - + ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose); // Configure interleave kernel _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>(); _interleave_kernel->configure(a, &_tmp_a); _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); - // Configure transpose kernel - _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); - _transpose_kernel->configure(b, &_tmp_b); - _aux_mem[TransposedRHS] = - MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + // Configure rhs transpose1xw kernel + _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); + _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b); + _aux_mem[Transposed1xWRHS] = + MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + + // Use a and b here instead of _tmp_a and _tmp_b because CpuGemmMatrixMultiplyKernel requires the original m,n,k in case of interleaved a and transposed1xw b + const int m = a->dimension(1); + const int n = b_to_use->dimension(0); + const int k = a->dimension(0); // Configure matrix multiplication kernel - _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); + _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose, + GEMMReshapeInfo(m, n, k)); } if (_run_bias_addition) @@ -179,6 +221,16 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_UNUSED(alpha); const bool is_c_bias = beta == 1 && c != nullptr; const bool 
run_addition = c != nullptr && beta != 0 && beta != 1; + // Check if we should use the pretransposed_b or original b + // TODO: COMPMID-6597 + // Note that this check should only apply to the non-optimized path. The reason we brought this at the beginning + // instead of only for the fallback path is because of the checks performed below, between here and the run_optimised decision + // We should simplify this by + // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch) + // 2. Moving this b_to_use check back into the non-optimized path + TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b)); + const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? &pretransposed_b : b; + // TODO: COMPMID-6597 fix-start ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); @@ -187,16 +239,16 @@ Status CpuGemm::validate(const ITensorInfo *a, if (is_fixed_format_fast_math(gemm_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use); } const int block_by = arm_compute::block_by(gemm_info.weight_format()); // test if im2col has changed the dimensions that are needed for padding - if (a->dimension(0) != b->dimension(1) && block_by > 1) + if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1) { // have to verify bias const size_t dim0_sz = a->dimension(0); @@ -204,18 +256,18 @@ Status CpuGemm::validate(const ITensorInfo *a, (dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + 
std::to_string(block_by)).c_str()); // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right - // b->dimension(1) = kernel_area * input_channel - // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right - const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by; - const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right; + // b_to_use->dimension(1) = kernel_area * input_channel + // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right + const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by; + const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right; ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (dim0_sz - kernel_area * input_pad_right) != b->dimension(1), + (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related"); } else { ARM_COMPUTE_RETURN_ERROR_ON_MSG( - a->dimension(0) != b->dimension(1), + a->dimension(0) != b_to_use->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); } @@ -233,14 +285,14 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d); ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); } if (d->total_size() != 0) { // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more. 
- ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0)); if (gemm_info.depth_output_gemm3d() != 0) { if (gemm_info.reinterpret_input_as_3d()) @@ -258,10 +310,14 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); } } + // TODO: COMPMID-6597 fix-end // Check if we need to run the optimized assembly kernel cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = + + // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_b() flag + // so we pass the original b to CpuGemmAssemblyDispatch + const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. !(!b->are_values_constant() && @@ -277,13 +333,13 @@ Status CpuGemm::validate(const ITensorInfo *a, // Check if the first input tensor is a vector. 
const bool run_vector_matrix_multiplication = a->dimension(1) < 2; // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = !run_vector_matrix_multiplication && !b->are_values_constant(); + const bool run_interleave_transpose = !run_vector_matrix_multiplication; // Arguments used by GEMMReshapeInfo // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo // in order to know how the matrices have been reshaped const int m = a->dimension(1); - const int n = b->dimension(0); + const int n = b_to_use->dimension(0); const int k = a->dimension(0); int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; @@ -292,7 +348,7 @@ Status CpuGemm::validate(const ITensorInfo *a, m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; + const ITensorInfo *matrix_b_info = b_to_use; TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; @@ -309,9 +365,10 @@ Status CpuGemm::validate(const ITensorInfo *a, ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape( - *b, mult_transpose1xW_width))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); + auto_init_if_empty(tmp_b_info, + b_to_use->clone()->set_tensor_shape( + compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info)); } // Validate matrix multiply @@ -367,29 +424,46 @@ void CpuGemm::run(ITensorPack &tensors) else { CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, 
true); - CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true); + CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true); CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; - if (!_run_vector_matrix_multiplication) + + if (_run_interleave_transpose) { // Run interleave kernel ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack); + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); + } + const ITensor *b_to_use = b; + if (_pretranspose_b_func) + { if (!_reshape_b_only_on_first_run) { - // Run transpose kernel - ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), - transpose_pack); + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); } - - // Use reshaped matrices - mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); - mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get()); + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + if (!_reshape_b_only_on_first_run) + { + // Run transpose1xw kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } + b_to_use = transposed1xw_b.get(); } + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_1, b_to_use); 
NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, @@ -426,17 +500,32 @@ void CpuGemm::prepare(ITensorPack &tensors) { _asm_glue->prepare(tensors); } - else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) + else if (_reshape_b_only_on_first_run) { - const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); - ITensor *b_aux = - utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS))); - ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux); - - CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux); - ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), - transpose_pack); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *b_to_use = b; + CpuAuxTensorHandler pretransposed_b( + offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors, + false /*pack_inject: no need to inject into tensors*/, + _pretranspose_b_func == + nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, + false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/); + + if (_pretranspose_b_func) + { + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + // Run transpose kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } } _is_prepared = true; } |