Diffstat (limited to 'src/cpu/operators/CpuGemm.cpp')
-rw-r--r--  src/cpu/operators/CpuGemm.cpp | 195
1 file changed, 142 insertions(+), 53 deletions(-)
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
index 8da166dbef..e035de0131 100644
--- a/src/cpu/operators/CpuGemm.cpp
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -53,6 +53,8 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
asm_info.fast_mode = info.fast_math();
asm_info.fixed_format = info.fixed_format();
asm_info.weight_format = info.weight_format();
+    asm_info.transpose_b =
+        info.pretranspose_B(); // Note: the "pretranspose_B" flag is not the pretranspose_B_array method; it tells pretranspose_B_array whether an additional transpose of B is required before that method runs.
return asm_info;
}
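
To make the new transpose_b flag concrete, here is a minimal sketch of how it propagates; it assumes GEMMInfo exposes a set_pretranspose_B() setter (not shown in this diff) and only restates what the comment above describes.

    // Sketch only (set_pretranspose_B() is assumed to be the setter paired with pretranspose_B()):
    GEMMInfo gemm_info{};
    gemm_info.set_pretranspose_B(true); // request one extra transpose of B before it is packed
    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
    // asm_info.transpose_b is now true, so pretranspose_B_array will transpose B once more before packing it.
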
@@ -72,7 +74,7 @@ void CpuGemm::configure(const ITensorInfo *a,
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
const bool is_c_bias = beta == 1 && c != nullptr;
- bool run_optimised =
+ const bool run_optimised =
bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
(c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
!(!b->are_values_constant() &&
@@ -92,14 +94,17 @@ void CpuGemm::configure(const ITensorInfo *a,
if (run_optimised)
{
+ _run_interleave_transpose = false;
const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
_asm_glue->configure(a, b, c_to_use, d, asm_info);
ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
- auto asm_mem_req = _asm_glue->workspace();
- _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
- _aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
+ const auto asm_mem_req = _asm_glue->workspace();
+ for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = asm_mem_req[slot];
+ }
// Scale product by alpha
if (_run_alpha_scale)
@@ -111,37 +116,74 @@ void CpuGemm::configure(const ITensorInfo *a,
}
else
{
+ _run_interleave_transpose = !_run_vector_matrix_multiplication;
// Pick output tensor in case bias addition should be performed
ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
+ // Pick b tensor in case pretranspose should be performed
+ const ITensorInfo *b_to_use = b;
_mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
+ // Configure rhs pretranspose
+ if (gemm_info.pretranspose_B())
+ {
+ _pretranspose_b_func = std::make_unique<CpuTranspose>();
+ _pretranspose_b_func->configure(b_to_use, &_pretransposed_b);
+ MemoryLifetime lifetime;
+ if (_reshape_b_only_on_first_run)
+ {
+ if (_run_interleave_transpose)
+ {
+ // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS
+ // So PreTransposedRHS can be freed inside prepare()
+ lifetime = MemoryLifetime::Prepare;
+ }
+ else
+ {
+ // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs
+ // So PreTransposedRHS needs to persist beyond prepare()
+ lifetime = MemoryLifetime::Persistent;
+ }
+ }
+ else
+ {
+ // PreTransposedRHS tensor is always used in run() and doesn't need to persist
+ lifetime = MemoryLifetime::Temporary;
+ }
+ _aux_mem[PreTransposedRHS] =
+ MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size());
+ b_to_use = &_pretransposed_b;
+ }
+
// Select between GEMV and GEMM
if (_run_vector_matrix_multiplication)
{
// Configure the matrix multiply kernel
- _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
+ _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false);
}
else
{
- const int m = a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
-
+ ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose);
// Configure interleave kernel
_interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
_interleave_kernel->configure(a, &_tmp_a);
_aux_mem[InterleavedLHS] =
MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
- // Configure transpose kernel
- _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
- _transpose_kernel->configure(b, &_tmp_b);
- _aux_mem[TransposedRHS] =
- MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+ // Configure rhs transpose1xw kernel
+ _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
+ _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b);
+ _aux_mem[Transposed1xWRHS] =
+ MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+
+            // Take m, n and k from a and b_to_use rather than from _tmp_a and _tmp_b, because CpuGemmMatrixMultiplyKernel needs the original dimensions when it is given interleaved a and transposed1xW b
+ const int m = a->dimension(1);
+ const int n = b_to_use->dimension(0);
+ const int k = a->dimension(0);
// Configure matrix multiplication kernel
- _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
+ _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose,
+ GEMMReshapeInfo(m, n, k));
}
if (_run_bias_addition)
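
The lifetime selection for PreTransposedRHS above can be restated as a small helper; this is a summary of the patch's logic, not additional behaviour.

    // Restated from the hunk above: lifetime of the PreTransposedRHS auxiliary buffer.
    MemoryLifetime choose_pretransposed_rhs_lifetime(bool reshape_b_only_on_first_run, bool run_interleave_transpose)
    {
        if (!reshape_b_only_on_first_run)
        {
            return MemoryLifetime::Temporary; // recomputed on every run()
        }
        // Reshaped only once: it can be freed after prepare() when a Transposed1xWRHS copy supersedes it,
        // otherwise it is the final form of the RHS and must persist beyond prepare().
        return run_interleave_transpose ? MemoryLifetime::Prepare : MemoryLifetime::Persistent;
    }
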
@@ -179,6 +221,16 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_UNUSED(alpha);
const bool is_c_bias = beta == 1 && c != nullptr;
const bool run_addition = c != nullptr && beta != 0 && beta != 1;
+    // Check if we should use the pretransposed_b or the original b
+    // TODO: COMPMID-6597
+    // Note that this check should only apply to the non-optimized path. It is performed here, rather than only in the
+    // fallback path, because of the checks below, between this point and the run_optimised decision. We should
+    // simplify this by:
+    // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights
+    //    format checks can and should be moved into CpuGemmAssemblyDispatch)
+    // 2. Moving this b_to_use check back into the non-optimized path
+ TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b));
+ const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? &pretransposed_b : b;
+ // TODO: COMPMID-6597 fix-start
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
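
A quick shape example for the b_to_use selection above; the shapes are illustrative and not taken from the patch.

    // Illustrative only: with pretranspose_B() == true, B is supplied in a layout that still needs one transpose,
    // so validate() transposes its TensorInfo before running the usual dimension checks.
    const TensorInfo a_info(TensorShape(8U /*K*/, 4U /*M*/), 1, DataType::F32);
    const TensorInfo b_supplied(TensorShape(8U, 16U), 1, DataType::F32); // as handed in by the caller
    TensorInfo b_for_checks =
        b_supplied.clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(b_supplied));
    // b_for_checks has shape (16, 8): b_for_checks.dimension(1) == a_info.dimension(0) == K,
    // so the "columns of A == rows of B" check still holds.
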
@@ -187,16 +239,16 @@ Status CpuGemm::validate(const ITensorInfo *a,
if (is_fixed_format_fast_math(gemm_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16);
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use);
}
const int block_by = arm_compute::block_by(gemm_info.weight_format());
// test if im2col has changed the dimensions that are needed for padding
- if (a->dimension(0) != b->dimension(1) && block_by > 1)
+ if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1)
{
// have to verify bias
const size_t dim0_sz = a->dimension(0);
@@ -204,18 +256,18 @@ Status CpuGemm::validate(const ITensorInfo *a,
(dim0_sz % block_by) != 0,
("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
// a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
- // b->dimension(1) = kernel_area * input_channel
- // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right
- const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by;
- const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right;
+ // b_to_use->dimension(1) = kernel_area * input_channel
+ // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right
+ const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by;
+ const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right;
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (dim0_sz - kernel_area * input_pad_right) != b->dimension(1),
+ (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1),
"The product AB is defined only if A number of columns and B number of rows are related");
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- a->dimension(0) != b->dimension(1),
+ a->dimension(0) != b_to_use->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
}
@@ -233,14 +285,14 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
"The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0),
"The C matrix must have the same number of columns as the matrix B");
}
if (d->total_size() != 0)
{
// For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
- ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0));
if (gemm_info.depth_output_gemm3d() != 0)
{
if (gemm_info.reinterpret_input_as_3d())
@@ -258,10 +310,14 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
}
}
+ // TODO: COMPMID-6597 fix-end
// Check if we need to run the optimized assembly kernel
cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised =
+
+    // Note that we use b instead of b_to_use here because asm_info also captures the pretranspose_B() flag,
+    // so we pass the original b to CpuGemmAssemblyDispatch
+ const bool run_optimised =
bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
(c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
!(!b->are_values_constant() &&
@@ -277,13 +333,13 @@ Status CpuGemm::validate(const ITensorInfo *a,
// Check if the first input tensor is a vector.
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
// Check if we need to reshape the matrix A and matrix B
- const bool run_interleave_transpose = !run_vector_matrix_multiplication && !b->are_values_constant();
+ const bool run_interleave_transpose = !run_vector_matrix_multiplication;
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
// in order to know how the matrices have been reshaped
const int m = a->dimension(1);
- const int n = b->dimension(0);
+ const int n = b_to_use->dimension(0);
const int k = a->dimension(0);
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
@@ -292,7 +348,7 @@ Status CpuGemm::validate(const ITensorInfo *a,
m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
+ const ITensorInfo *matrix_b_info = b_to_use;
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
@@ -309,9 +365,10 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(
- *b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+ auto_init_if_empty(tmp_b_info,
+ b_to_use->clone()->set_tensor_shape(
+ compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info));
}
// Validate matrix multiply
@@ -367,29 +424,46 @@ void CpuGemm::run(ITensorPack &tensors)
else
{
CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true);
- CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
+ CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors);
+ CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true);
CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
- if (!_run_vector_matrix_multiplication)
+
+ if (_run_interleave_transpose)
{
// Run interleave kernel
ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
interleave_pack);
+ // Use reshaped matrices
+ mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
+ }
+ const ITensor *b_to_use = b;
+ if (_pretranspose_b_func)
+ {
if (!_reshape_b_only_on_first_run)
{
- // Run transpose kernel
- ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
- transpose_pack);
+ // Run pretranspose kernel
+ ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+ _pretranspose_b_func->run(pretranspose_pack);
}
-
- // Use reshaped matrices
- mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
- mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
+ b_to_use = pretransposed_b.get();
+ }
+ if (_run_interleave_transpose)
+ {
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run transpose1xw kernel
+ ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+ NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+ _transpose1xW_b_kernel->window(), transpose_pack);
+ }
+ b_to_use = transposed1xw_b.get();
}
+ // Use reshaped matrices
+ mm_pack.add_const_tensor(ACL_SRC_1, b_to_use);
NEScheduler::get().schedule_op(_mm_kernel.get(),
_run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
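
For readability, the fallback run() path above reduces to the following data flow; this is a restatement in comment form, not new behaviour.

    // a        --CpuGemmInterleave4x4Kernel-->  interleaved_a    (only if _run_interleave_transpose)
    // b        --CpuTranspose (pretranspose)--> pretransposed_b  (only if _pretranspose_b_func is set; skipped in
    //                                                             run() when _reshape_b_only_on_first_run, since
    //                                                             prepare() has already produced it)
    // b_to_use --CpuGemmTranspose1xWKernel-->   transposed1xw_b  (only if _run_interleave_transpose)
    // CpuGemmMatrixMultiplyKernel then consumes (interleaved_a or a) and the last produced form of B.
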
@@ -426,17 +500,32 @@ void CpuGemm::prepare(ITensorPack &tensors)
{
_asm_glue->prepare(tensors);
}
- else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+ else if (_reshape_b_only_on_first_run)
{
- const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *b_aux =
- utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
- ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
-
- CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
- ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
- transpose_pack);
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ const ITensor *b_to_use = b;
+ CpuAuxTensorHandler pretransposed_b(
+ offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors,
+ false /*pack_inject: no need to inject into tensors*/,
+ _pretranspose_b_func ==
+ nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/);
+ CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors,
+ false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/);
+
+ if (_pretranspose_b_func)
+ {
+ // Run pretranspose kernel
+ ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+ _pretranspose_b_func->run(pretranspose_pack);
+ b_to_use = pretransposed_b.get();
+ }
+ if (_run_interleave_transpose)
+ {
+ // Run transpose kernel
+ ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+ NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+ _transpose1xW_b_kernel->window(), transpose_pack);
+ }
}
_is_prepared = true;
}
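
A note on the two CpuAuxTensorHandler flags used in prepare() above, restating what the inline comments already say rather than documenting the handler itself.

    // pack_inject == false : the auxiliary buffer stays local to prepare() and is not added back into `tensors`.
    // bypass_alloc == true : the buffer is not allocated when the stage that would fill it will not run, e.g. no
    //                        PreTransposedRHS allocation when _pretranspose_b_func == nullptr, and no
    //                        Transposed1xWRHS allocation when _run_interleave_transpose == false.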