diff options
author | Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com> | 2022-09-29 13:07:21 +0100 |
---|---|---|
committer | Mohmun02 <MohammedSuhail.Munshi@arm.com> | 2022-10-20 08:36:41 +0000 |
commit | 6782452c16a286a6dd4a071cfc70bbbcbabb20be (patch) | |
tree | 601a9d2ba302ed3c93ca21dab8e8dc28ce9c7a04 /src/cpu/kernels | |
parent | 0a36f58e711a19040b1b65ef0afa83cce0c9dc4c (diff) | |
download | ComputeLibrary-6782452c16a286a6dd4a071cfc70bbbcbabb20be.tar.gz |
Add test in GEMMLowp for batch matmul
- Adds tests for batched matrix multiplication
- Bugfix for issue: 3D input tensors with offsets in GemmLowp result in mismatches
Resolves : [COMPMID-5507]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I68e036fbca642c1841dd4321033045aadc8f5636
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/461298
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8482
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/kernels')
-rw-r--r-- | src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp | 20 |
1 files changed, 12 insertions, 8 deletions
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp index a9896772f6..a65f1a33de 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -108,7 +108,9 @@ void run_offset_contribution(const Window &window, const int window_end_x = window.x().end(); const int window_step_x = 16; - Iterator mm_result_it(mm_result, collapsed_window); + // if vector_sum_col is nullptr then stride_y is 0, else get stride_y + const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0; + Iterator mm_result_it(mm_result, collapsed_window); if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true { @@ -133,9 +135,10 @@ void run_offset_contribution(const Window &window, execute_window_loop(collapsed_window, [&](const Coordinates & id) { - const int batch_id = id.z() / depth_input; - auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = batch_id * (sum_col_stride_y ); + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); // Compute the leftover term due to b_offset. 
int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); @@ -291,9 +294,10 @@ void run_offset_contribution(const Window &window, execute_window_loop(collapsed_window, [&](const Coordinates & id) { - const int batch_id = id.z() / depth_input; - auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = batch_id * (sum_col_stride_y ); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); int x = window_start_x; for(; x <= (window_end_x - window_step_x); x += window_step_x) |