Diffstat (limited to 'src/cpu/operators/CpuGemm.cpp')
-rw-r--r-- | src/cpu/operators/CpuGemm.cpp | 195
1 file changed, 142 insertions(+), 53 deletions(-)
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
index 8da166dbef..e035de0131 100644
--- a/src/cpu/operators/CpuGemm.cpp
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -53,6 +53,8 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
     asm_info.fast_mode     = info.fast_math();
     asm_info.fixed_format  = info.fixed_format();
     asm_info.weight_format = info.weight_format();
+    asm_info.transpose_b =
+        info.pretranspose_B(); // Note: the "pretranspose_B" flag is not the same as the pretranspose_B_array() method; the flag tells pretranspose_B_array() whether to apply an additional transpose to B before it runs

     return asm_info;
 }
@@ -72,7 +74,7 @@ void CpuGemm::configure(const ITensorInfo *a,
     const cpu::AsmGemmInfo asm_info  = init_assembly_metadata(gemm_info);
     const bool             is_c_bias = beta == 1 && c != nullptr;
-    bool                   run_optimised =
+    const bool             run_optimised =
         bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
         (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
         !(!b->are_values_constant() &&
@@ -92,14 +94,17 @@ void CpuGemm::configure(const ITensorInfo *a,

     if (run_optimised)
     {
+        _run_interleave_transpose   = false;
         const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
         _asm_glue                   = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
         _asm_glue->configure(a, b, c_to_use, d, asm_info);
         ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());

-        auto asm_mem_req           = _asm_glue->workspace();
-        _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
-        _aux_mem[Pretraspose]      = asm_mem_req[Pretraspose];
+        const auto asm_mem_req = _asm_glue->workspace();
+        for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+        {
+            _aux_mem[slot] = asm_mem_req[slot];
+        }

         // Scale product by alpha
         if (_run_alpha_scale)
@@ -111,37 +116,74 @@ void CpuGemm::configure(const ITensorInfo *a,
     }
     else
     {
+        _run_interleave_transpose = !_run_vector_matrix_multiplication;
         // Pick output tensor in case bias addition should be performed
         ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
+        // Pick b tensor in case pretranspose should be performed
+        const ITensorInfo *b_to_use = b;

         _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();

+        // Configure rhs pretranspose
+        if (gemm_info.pretranspose_B())
+        {
+            _pretranspose_b_func = std::make_unique<CpuTranspose>();
+            _pretranspose_b_func->configure(b_to_use, &_pretransposed_b);
+            MemoryLifetime lifetime;
+            if (_reshape_b_only_on_first_run)
+            {
+                if (_run_interleave_transpose)
+                {
+                    // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS,
+                    // so PreTransposedRHS can be freed inside prepare()
+                    lifetime = MemoryLifetime::Prepare;
+                }
+                else
+                {
+                    // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs,
+                    // so PreTransposedRHS needs to persist beyond prepare()
+                    lifetime = MemoryLifetime::Persistent;
+                }
+            }
+            else
+            {
+                // PreTransposedRHS tensor is always used in run() and doesn't need to persist
+                lifetime = MemoryLifetime::Temporary;
+            }
+            _aux_mem[PreTransposedRHS] =
+                MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size());
+            b_to_use = &_pretransposed_b;
+        }
+
         // Select between GEMV and GEMM
         if (_run_vector_matrix_multiplication)
         {
             // Configure the matrix multiply kernel
-            _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
+            _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false);
         }
         else
         {
-            const int m = a->dimension(1);
-            const int n = b->dimension(0);
-            const int k = a->dimension(0);
-
+            ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose);
             // Configure interleave kernel
             _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
             _interleave_kernel->configure(a, &_tmp_a);
             _aux_mem[InterleavedLHS] =
                 MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());

-            // Configure transpose kernel
-            _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
-            _transpose_kernel->configure(b, &_tmp_b);
-            _aux_mem[TransposedRHS] =
-                MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+            // Configure rhs transpose1xw kernel
+            _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
+            _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b);
+            _aux_mem[Transposed1xWRHS] =
+                MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+
+            // Use a and b_to_use here instead of _tmp_a and _tmp_b because CpuGemmMatrixMultiplyKernel
+            // requires the original m, n, k when a is interleaved and b is transposed1xW
+            const int m = a->dimension(1);
+            const int n = b_to_use->dimension(0);
+            const int k = a->dimension(0);

             // Configure matrix multiplication kernel
-            _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
+            _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose,
+                                  GEMMReshapeInfo(m, n, k));
         }

         if (_run_bias_addition)
@@ -179,6 +221,16 @@ Status CpuGemm::validate(const ITensorInfo *a,
     ARM_COMPUTE_UNUSED(alpha);
     const bool is_c_bias    = beta == 1 && c != nullptr;
     const bool run_addition = c != nullptr && beta != 0 && beta != 1;
+    // Check if we should use the pretransposed_b or original b
+    // TODO: COMPMID-6597
+    // Note that this check should only apply to the non-optimized path. We hoisted it to the beginning,
+    // instead of keeping it in the fallback path only, because the checks performed below (between here and the run_optimised decision) depend on it.
+    // We should simplify this by
+    // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch)
+    // 2. Moving this b_to_use check back into the non-optimized path
+    TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b));
+    const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? &pretransposed_b : b;
+    // TODO: COMPMID-6597 fix-start

     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
@@ -187,16 +239,16 @@ Status CpuGemm::validate(const ITensorInfo *a,
     if (is_fixed_format_fast_math(gemm_info.weight_format()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use);
     }

     const int block_by = arm_compute::block_by(gemm_info.weight_format());
     // test if im2col has changed the dimensions that are needed for padding
-    if (a->dimension(0) != b->dimension(1) && block_by > 1)
+    if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1)
     {
         // have to verify bias
         const size_t dim0_sz = a->dimension(0);
@@ -204,18 +256,18 @@ Status CpuGemm::validate(const ITensorInfo *a,
             (dim0_sz % block_by) != 0,
             ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
         // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
-        // b->dimension(1) = kernel_area * input_channel
-        // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right
-        const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by;
-        const size_t kernel_area     = (dim0_sz - b->dimension(1)) / input_pad_right;
+        // b_to_use->dimension(1) = kernel_area * input_channel
+        // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right
+        const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by;
+        const size_t kernel_area     = (dim0_sz - b_to_use->dimension(1)) / input_pad_right;
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            (dim0_sz - kernel_area * input_pad_right) != b->dimension(1),
+            (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1),
             "The product AB is defined only if A number of columns and B number of rows are related");
     }
     else
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            a->dimension(0) != b->dimension(1),
+            a->dimension(0) != b_to_use->dimension(1),
             "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     }

@@ -233,14 +285,14 @@ Status CpuGemm::validate(const ITensorInfo *a,
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
                                         "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0),
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0),
                                         "The C matrix must have the same number of columns as the matrix B");
     }

     if (d->total_size() != 0)
     {
         // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
-        ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0));
+        ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0));
         if (gemm_info.depth_output_gemm3d() != 0)
         {
             if (gemm_info.reinterpret_input_as_3d())
@@ -258,10 +310,14 @@ Status CpuGemm::validate(const ITensorInfo *a,
             ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
         }
     }
+    // TODO: COMPMID-6597 fix-end

     // Check if we need to run the optimized assembly kernel
     cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
-    const bool       run_optimised =
+
+    // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_B() flag,
+    // so we pass the original b to CpuGemmAssemblyDispatch
+    const bool run_optimised =
         bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
         (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
         !(!b->are_values_constant() &&
@@ -277,13 +333,13 @@ Status CpuGemm::validate(const ITensorInfo *a,
         // Check if the first input tensor is a vector.
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
         // Check if we need to reshape the matrix A and matrix B
-        const bool run_interleave_transpose = !run_vector_matrix_multiplication && !b->are_values_constant();
+        const bool run_interleave_transpose = !run_vector_matrix_multiplication;

         // Arguments used by GEMMReshapeInfo
         // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
         // in order to know how the matrices have been reshaped
         const int m                         = a->dimension(1);
-        const int n                         = b->dimension(0);
+        const int n                         = b_to_use->dimension(0);
         const int k                         = a->dimension(0);
         int       mult_transpose1xW_width   = 1;
         int       mult_interleave4x4_height = 1;
@@ -292,7 +348,7 @@ Status CpuGemm::validate(const ITensorInfo *a,
             m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());

         const ITensorInfo *matrix_a_info = a;
-        const ITensorInfo *matrix_b_info = b;
+        const ITensorInfo *matrix_b_info = b_to_use;

         TensorInfo tmp_a_info{};
         TensorInfo tmp_b_info{};
@@ -309,9 +365,10 @@ Status CpuGemm::validate(const ITensorInfo *a,
             ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));

             // Validate transpose kernel
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(
-                                               *b, mult_transpose1xW_width)));
-            ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+            auto_init_if_empty(tmp_b_info,
+                               b_to_use->clone()->set_tensor_shape(
+                                   compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width)));
+            ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info));
         }

         // Validate matrix multiply
@@ -367,29 +424,46 @@ void CpuGemm::run(ITensorPack &tensors)
     else
     {
         CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true);
-        CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
+        CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors);
+        CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true);
         CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);

         ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
-        if (!_run_vector_matrix_multiplication)
+
+        if (_run_interleave_transpose)
         {
             // Run interleave kernel
             ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
             NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
                                            interleave_pack);
+            // Use reshaped matrices
+            mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
+        }

+        const ITensor *b_to_use = b;
+        if (_pretranspose_b_func)
+        {
             if (!_reshape_b_only_on_first_run)
             {
-                // Run transpose kernel
-                ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
-                NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
-                                               transpose_pack);
+                // Run pretranspose kernel
+                ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+                _pretranspose_b_func->run(pretranspose_pack);
             }
-
-            // Use reshaped matrices
-            mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
-            mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
+            b_to_use = pretransposed_b.get();
+        }
+        if (_run_interleave_transpose)
+        {
+            if (!_reshape_b_only_on_first_run)
+            {
+                // Run transpose1xw kernel
+                ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+                NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+                                               _transpose1xW_b_kernel->window(), transpose_pack);
+            }
+            b_to_use = transposed1xw_b.get();
         }
+        // Use reshaped matrices
+        mm_pack.add_const_tensor(ACL_SRC_1, b_to_use);

         NEScheduler::get().schedule_op(_mm_kernel.get(),
                                        _run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
@@ -426,17 +500,32 @@ void CpuGemm::prepare(ITensorPack &tensors)
     {
         _asm_glue->prepare(tensors);
     }
-    else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+    else if (_reshape_b_only_on_first_run)
     {
-        const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
-        ITensor       *b_aux =
-            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
-        ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
-
-        CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
-        ITensorPack         transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
-        NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
-                                       transpose_pack);
+        const ITensor      *b        = tensors.get_const_tensor(ACL_SRC_1);
+        const ITensor      *b_to_use = b;
+        CpuAuxTensorHandler pretransposed_b(
+            offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors,
+            false /*pack_inject: no need to inject into tensors*/,
+            _pretranspose_b_func == nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/);
+        CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors,
+                                            false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/);

+        if (_pretranspose_b_func)
+        {
+            // Run pretranspose kernel
+            ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+            _pretranspose_b_func->run(pretranspose_pack);
+            b_to_use = pretransposed_b.get();
+        }
+        if (_run_interleave_transpose)
+        {
+            // Run transpose kernel
+            ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+            NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+                                           _transpose1xW_b_kernel->window(), transpose_pack);
+        }
     }
     _is_prepared = true;
 }
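Taken together with the init_assembly_metadata() change above, GEMMInfo::pretranspose_B() means: apply one extra transpose to B before any other rhs processing. On the fallback path that extra transpose is the new CpuTranspose stage. A minimal standalone sketch of the resulting semantics (plain C++ with illustrative names, not ACL code; assumes row-major storage and a caller that hands in B already transposed):

#include <cstddef>
#include <vector>

// Plain transpose of a rows x cols row-major matrix: the role CpuTranspose
// plays for the PreTransposedRHS stage.
static std::vector<float> transpose(const std::vector<float> &src, std::size_t rows, std::size_t cols)
{
    std::vector<float> dst(src.size());
    for (std::size_t r = 0; r < rows; ++r)
        for (std::size_t c = 0; c < cols; ++c)
            dst[c * rows + r] = src[r * cols + c];
    return dst;
}

// Naive reference GEMM: d = alpha * a(m x k) * b(k x n).
static std::vector<float> gemm(const std::vector<float> &a, const std::vector<float> &b,
                               std::size_t m, std::size_t n, std::size_t k, float alpha)
{
    std::vector<float> d(m * n, 0.f);
    for (std::size_t i = 0; i < m; ++i)
        for (std::size_t j = 0; j < n; ++j)
        {
            float acc = 0.f;
            for (std::size_t p = 0; p < k; ++p)
                acc += a[i * k + p] * b[p * n + j];
            d[i * n + j] = alpha * acc;
        }
    return d;
}

int main()
{
    const std::size_t m = 2, n = 3, k = 4;
    std::vector<float> a(m * k, 1.f);
    std::vector<float> b_supplied(n * k, 1.f); // rhs supplied in transposed (n x k) layout

    // pretranspose_B() == true: undo the caller-side transpose first, then run the
    // usual (interleave / transpose1xW +) matrix multiply on the restored k x n matrix.
    const std::vector<float> b = transpose(b_supplied, n, k);
    const std::vector<float> d = gemm(a, b, m, n, k, 1.f);
    return d.size() == m * n ? 0 : 1;
}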
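The MemoryLifetime chosen for PreTransposedRHS in configure() is the subtlest part of the patch and can be read as a small decision function. In this standalone sketch the enum is a stand-in for arm_compute's MemoryLifetime, and the function merely mirrors the branch logic in the hunk above:

#include <cassert>

// Stand-in for arm_compute's MemoryLifetime enum.
enum class MemoryLifetime { Temporary, Persistent, Prepare };

// Mirrors the PreTransposedRHS lifetime selection in CpuGemm::configure().
MemoryLifetime pretransposed_rhs_lifetime(bool reshape_b_only_on_first_run, bool run_interleave_transpose)
{
    if (!reshape_b_only_on_first_run)
        return MemoryLifetime::Temporary; // recomputed on every run(): no need to persist
    return run_interleave_transpose
               ? MemoryLifetime::Prepare     // superseded by Transposed1xWRHS: freeable inside prepare()
               : MemoryLifetime::Persistent; // final form of the rhs: must outlive prepare()
}

int main()
{
    assert(pretransposed_rhs_lifetime(true, true) == MemoryLifetime::Prepare);
    assert(pretransposed_rhs_lifetime(true, false) == MemoryLifetime::Persistent);
    assert(pretransposed_rhs_lifetime(false, true) == MemoryLifetime::Temporary);
    return 0;
}

The key observation is that Prepare is only safe because a later stage (Transposed1xWRHS) carries the data forward; without that successor, the pretransposed buffer is the final rhs and has to stay Persistent.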
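Finally, run() and prepare() now share one chaining pattern for the rhs: b_to_use starts at the user-supplied tensor and is rebound after each enabled stage, so the matrix-multiply kernel always consumes the output of the last transformation. A standalone model of that selection (illustrative names, not ACL code):

#include <cstdio>

// Minimal model of the rhs buffer chaining in CpuGemm::run()/prepare().
struct Buf { const char *name; };

const Buf *select_rhs(const Buf *b, const Buf *pretransposed, const Buf *transposed1xw,
                      bool has_pretranspose, bool run_interleave_transpose)
{
    const Buf *b_to_use = b;
    if (has_pretranspose)           // stage 1: plain transpose (PreTransposedRHS)
        b_to_use = pretransposed;
    if (run_interleave_transpose)   // stage 2: 1xW reshape (Transposed1xWRHS)
        b_to_use = transposed1xw;
    return b_to_use;                // what the MM kernel receives as ACL_SRC_1
}

int main()
{
    Buf b{"b"}, pre{"pretransposed_b"}, t1xw{"transposed1xw_b"};
    std::printf("%s\n", select_rhs(&b, &pre, &t1xw, true, true)->name); // prints: transposed1xw_b
    return 0;
}

Rebinding a single pointer keeps the two optional stages independent: either, both, or neither can run without special-casing the kernel input.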