From 3d8fe497fc76ec6ad265c03fe02e29ed2ddf2d93 Mon Sep 17 00:00:00 2001 From: giuros01 Date: Tue, 1 Oct 2019 12:17:49 +0100 Subject: INFPRF-609:Performance Issue of the Latest ArmCL We were creating too many small GEMM workloads. This hurts performance when the number of threads is small and the matrices are large (especially when multiple single-threaded processes are running on the same machine). Change-Id: I807019a7b2d043ca72b4bca11eb0b1960da00694 Signed-off-by: giuros01 Reviewed-on: https://review.mlplatform.org/c/2012 Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h | 1 - src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h b/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h index d3dda9a95f..eeea0babf1 100644 --- a/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h +++ b/arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h @@ -141,7 +141,6 @@ private: std::vector _mm_workloads{}; std::vector _workloads{}; std::string _tag{}; - unsigned int _num_windows{ 1 }; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDWRAPPER_H__ */ diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp index 41d7d1ff76..79e40a7181 100644 --- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp +++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp @@ -212,7 +212,7 @@ void NEGEMMInterleavedWrapper::prepare() //Maximum number of workloads to create: const unsigned int num_threads = NEScheduler::get().num_threads(); - const unsigned int max_iterations = std::max(num_threads, _num_windows); + const unsigned int max_iterations = num_threads == 
1 ? 1 : num_threads; //Maximum number of iterations the parameters allow: const unsigned int num_iterations = _batch_window.num_iterations_total(); // Keep the smallest of the two: @@ -362,7 +362,6 @@ void NEGEMMInterleavedWrapper::configure(const ITensor *a, const ITensor *b, ITe // Get strategy std::unique_ptr strategy = detail::create_strategy(gemm_kernel_info.name); - _num_windows = iceildiv(_params.M, strategy->out_height()) * _params.batches; ARM_COMPUTE_ERROR_ON(strategy == nullptr); if(!_pretranspose_b) -- cgit v1.2.1