From 0d548048a60035349b90b903924fbf38c74796b9 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 3 Oct 2019 15:12:09 +0100 Subject: COMPMID-2572: Update the heuristic in CLGEMM for FP16 Change-Id: Ia7f248d72bf2690a8a4dae3f2e2afc983109bd6e Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/2035 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena --- ...GEMMReshapedOnlyRHSKernelConfigurationBifrost.h | 3 + .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h | 4 +- ...MMReshapedOnlyRHSKernelConfigurationBifrost.cpp | 112 +++++++++++++++++---- .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp | 2 +- src/runtime/CL/functions/CLGEMM.cpp | 2 +- 5 files changed, 102 insertions(+), 21 deletions(-) diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h index 4e6112e269..5eb4aadb09 100644 --- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h +++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h @@ -55,6 +55,9 @@ private: std::pair configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); std::pair configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h index e52d3ca099..b9a1ba0bf6 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h @@ -51,7 +51,7 @@ public: CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4. + * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4. * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0. * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 @@ -70,7 +70,7 @@ public: const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel * - * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less or equal than 4. + * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4. * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3. * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0. * @param[in] output Output tensor info. Data type supported: same as @p input0 diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp index 9f3fc3aae7..5955bac384 100644 --- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp +++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp @@ -42,9 +42,6 @@ CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::CLGEMMReshapedOnlyRHSKernelConf std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8); - ARM_COMPUTE_UNUSED(data_type); - using ConfigurationFunctionExecutorPtr = std::pair (CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); @@ -52,6 +49,7 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi static std::map gemm_configs_G51 = { { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_f32 }, + { DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_f16 }, { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_u8 } }; @@ -59,6 +57,7 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi static std::map gemm_configs_G76 = { { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32 }, + { DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f16 }, { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 } }; @@ -66,17 +65,39 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi static std::map gemm_configs_G7x = { { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32 }, + { DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f16 }, { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 } }; switch(_target) { case GPUTarget::G76: - return (this->*gemm_configs_G76[data_type])(m, n, k, b); + if (gemm_configs_G76.find(data_type) != gemm_configs_G76.end()) + { + return (this->*gemm_configs_G76[data_type])(m, n, k, b); + } + else + { + ARM_COMPUTE_ERROR("Not supported data type"); + } case GPUTarget::G51: - return (this->*gemm_configs_G51[data_type])(m, n, k, b); + if (gemm_configs_G51.find(data_type) != gemm_configs_G51.end()) + { + return (this->*gemm_configs_G51[data_type])(m, n, k, b); + } + else + { + ARM_COMPUTE_ERROR("Not supported data type"); + } default: - return (this->*gemm_configs_G7x[data_type])(m, n, k, b); + if (gemm_configs_G7x.find(data_type) != gemm_configs_G7x.end()) + { + return (this->*gemm_configs_G7x[data_type])(m, n, k, b); + } + else + { + ARM_COMPUTE_ERROR("Not supported data type"); + } } } @@ -89,12 +110,12 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi { if(n > 2048) { - const unsigned int h0 = std::max(n / 4, static_cast(1)); + const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); } else { - const unsigned int h0 = std::max(n / 2, static_cast(1)); + const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); } } @@ -111,7 +132,7 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi if(m == 1) { - const unsigned int h0 = std::max(n / 2, static_cast(1)); + const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); } else @@ -128,7 +149,7 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi if(m == 1) { const unsigned int n0 = n < 1280? 2 : 4; - const unsigned int h0 = std::max(n / n0, static_cast(1)); + const unsigned int h0 = std::max(n / n0, 1U); return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true); } else @@ -137,6 +158,63 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi } } +std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n > 2048) + { + const unsigned int h0 = std::max(n / 4, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); + } + else + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); + } +} + +std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); + } +} + +std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int n0 = n < 1280? 2 : 4; + const unsigned int h0 = std::max(n / n0, 1U); + return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); + } +} + std::pair CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); @@ -146,12 +224,12 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi { if(m == 1) { - const unsigned int h0 = std::max(n / 2, static_cast(1)); + const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); } else { - const unsigned int h0 = std::max(n / 4, static_cast(1)); + const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true); } } @@ -159,12 +237,12 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi { if(m == 1) { - const unsigned int h0 = std::max(n / 2, static_cast(1)); + const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); } else { - const unsigned int h0 = std::max(n / 4, static_cast(1)); + const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, h0, false, true, false, true); } } @@ -177,7 +255,7 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi if(m == 1) { - const unsigned int h0 = std::max(n / 2, static_cast(1)); + const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); } else @@ -193,12 +271,12 @@ std::pair CLGEMMReshapedOnlyRHSKernelConfi if(m == 1) { - const unsigned int h0 = std::max(n / 2, static_cast(1)); + const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); } else { - const unsigned int h0 = std::max(n / 2, static_cast(1)); + const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); } } diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp index 3d5e1486a6..24b2301283 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp @@ -57,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 2a027d872c..8d460142e5 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -109,7 +109,7 @@ CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsign { if((m == 1) || (!reshape_b_only_on_first_run)) { - gemm_type = GEMMType::NATIVE; + gemm_type = GEMMType::RESHAPED_ONLY_RHS; } else { -- cgit v1.2.1