aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2019-06-24 17:47:51 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2019-07-18 13:17:27 +0000
commit06be6f8d2a316a307fa623150f8adf8f9c3416c5 (patch)
tree0db15a25f2c306fab3d843236f878ec9479b7f57 /src/runtime
parentff2719299ea76a95f20a35a7900875a8152e293a (diff)
downloadComputeLibrary-06be6f8d2a316a307fa623150f8adf8f9c3416c5.tar.gz
COMPMID-2096: Refactor the CLGEMMLowp function selection (heuristic)
Change-Id: I15a8b39e0354d3b6686ed4cc8c361782c0512037 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/1410 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: VidhyaSudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp82
1 files changed, 71 insertions, 11 deletions
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 875e3a2a00..0286cb3d6d 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -48,7 +49,8 @@ inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_tar
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
- _mm_kernel(),
+ _mm_midgard_kernel(),
+ _mm_native_kernel(),
_mm_reshaped_only_rhs_kernel(),
_mtx_b_reshape_kernel(),
_mtx_a_reduction_kernel(),
@@ -63,6 +65,7 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
_a_offset(0),
_b_offset(0),
_is_gemm_reshaped(true),
+ _is_midgard(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
_fuse_output_stage(false)
@@ -84,7 +87,9 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _mm_kernel.set_target(gpu_target);
+ _mm_midgard_kernel.set_target(gpu_target);
+ _mm_native_kernel.set_target(gpu_target);
+ _mm_reshaped_only_rhs_kernel.set_target(gpu_target);
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
@@ -103,6 +108,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
// Check if we need to reshape the matrix A and matrix B
_is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
+ _is_midgard = gpu_target == GPUTarget::MIDGARD;
if(_is_gemm_reshaped)
{
@@ -159,8 +165,19 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
}
else
{
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_midgard)
+ {
+ // Configure matrix multiply kernel
+ _mm_midgard_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
}
// Configure offset contribution kernel
@@ -178,8 +195,19 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
}
else
{
- // Configure matrix multiply kernel
- _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ if(_is_midgard)
+ {
+ // Configure matrix multiply kernel
+ _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+ }
}
// Configure offset contribution kernel
@@ -232,6 +260,7 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
const unsigned int k = a->dimension(0);
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool is_midgard = gpu_target == GPUTarget::MIDGARD;
bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
@@ -287,9 +316,21 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
// Output tensor auto inizialitation if not yet initialized
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+ if(is_midgard)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
}
+
// Validate offset contribution kernel
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -308,9 +349,21 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
}
else
{
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+ if(is_midgard)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ }
}
+
if(output->total_size() != 0)
{
// Validate offset contribution kernel
@@ -353,7 +406,14 @@ void CLGEMMLowpMatrixMultiplyCore::run()
}
else
{
- CLScheduler::get().enqueue(_mm_kernel, false);
+ if(_is_midgard)
+ {
+ CLScheduler::get().enqueue(_mm_midgard_kernel, false);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(_mm_native_kernel, false);
+ }
}
// Run matrix A reduction kernel only if _b_offset is not equal to 0