From b3077fbaee868579f9a41888fef1f71286d6757c Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Tue, 3 Jan 2023 17:59:14 +0000 Subject: LHS broadcasting addition for dynamic fusion * Binary elementwise operator now can have broadcasting in either X dimension, Y+Z dimension, or both, in either LHS or RHS operand. * Fix bug in CL code to support batching. Resolves: COMPMID-5704 Signed-off-by: Viet-Hoa Do Change-Id: I51b04986d30861f255ca9f754adffa0e6c85a26b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8898 Reviewed-by: SiCong Li Reviewed-by: Ramy Elgammal Tested-by: Arm Jenkins Dynamic-Fusion: Ramy Elgammal Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../dynamic_fusion/sketch/gpu/operators/GpuAdd.h | 3 +- src/core/CL/cl_kernels/tile_helpers.h | 26 ++++- .../components/cl/ClComponentElementwiseBinary.cpp | 32 ++++-- .../cl/ClTemplateElementwiseBinary.cpp | 122 ++++++--------------- tests/datasets/ShapeDatasets.h | 18 ++- tests/validation/dynamic_fusion/gpu/cl/Add.cpp | 16 ++- 6 files changed, 110 insertions(+), 107 deletions(-) diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h index 833f341b2f..61f7b406b2 100644 --- a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,7 +44,6 @@ public: /** Create an operator and fuse it into the workload sketch. * @note If @ref validate_op() fails, the creation also fails and may throw an error. * @note If @ref validate_op() fails, @p sketch remains unchanged and valid. - * @note Batching is not supported yet * * Valid data type configurations: * |lhs |rhs |dst | diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 861ea63eca..acc174d04f 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -1059,6 +1059,9 @@ }) #define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) +#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) +#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) + #define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) /** Element-wise scale with a constant value @@ -1101,6 +1104,27 @@ }) \ }) +/** Element-wise operation with LHS broadcasted (LHS has the X dimension only) + * + * @note Performs: LHS[broadcasted] OP RHS = DST + * @note Both tiles must have same data type + * + * @param[in] T_ELWISE_OP Elementwise operator to perform + * @param[in] DST_DATA_TYPE DST data type + * @param[in] M0 Number of RHS rows + * @param[in] N0 Number of RHS columns + * @param[in] lhs LHS tile + * @param[in] rhs RHS tile + * @param[out] dst DST tile + */ +#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ + ({ \ + LOOP_UNROLLING(int, _m0, 0, 1, M0, \ + { \ + dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ + }) \ + }) + #define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) #define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp index a17d835ac6..9b006b13ce 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -66,8 +66,30 @@ Status ClComponentElementwiseBinary::validate(const ArgumentPack &t const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!rhs_in_place && !lhs_in_place) && detail::have_different_dimensions(lhs->tensor_shape(), dst->tensor_shape(), 0), - "Only the rhs operand can be broadcast to match the accumulator's (lhs) shape"); + + const auto &lhs_shape = lhs->tensor_shape(); + const auto &rhs_shape = rhs->tensor_shape(); + const auto &dst_shape = dst->tensor_shape(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(lhs_shape, dst_shape, 0) && detail::have_different_dimensions(rhs_shape, dst_shape, 0), + "Only LHS or RHS can be broadcasting, not both."); + + // Dimension Y and Z are collapsed together in the current kernel implementation, + // hence they cannot be independently broadcast or non-broadcast. + // See: ClTemplateElementwiseBinary::get_window + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), + "Dimension Y and Z must both be either broadcast or non-broadcast."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(lhs_shape, dst_shape, 3), + "LHS broadcast in dimension 3 or higher is not supported."); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(rhs_shape, dst_shape, 3), + "RHS broadcast in dimension 3 or higher is not supported."); + // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); @@ -76,10 +98,6 @@ Status ClComponentElementwiseBinary::validate(const ArgumentPack &t ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst); - // Batching case not supported yet - const size_t idx_batch = get_data_layout_dimension_index(lhs->data_layout(), DataLayoutDimension::BATCHES); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs->tensor_shape()[idx_batch] != 1) || (rhs->tensor_shape()[idx_batch] != 1) || (dst->tensor_shape()[idx_batch] != 1), "Batching case not supported yet"); - // All tensor infos are initialized ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index df8deee44f..01017ed909 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,9 +61,7 @@ std::string ClTemplateElementwiseBinary::get_name() const std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup &comp_group) const { - ARM_COMPUTE_UNUSED(comp_group); std::string code; - const bool is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape(); const bool is_root = (comp_group.get_root_component()->id() == this->id()); const bool is_lhs_input = comp_group.is_input_tensor(_lhs); const bool is_rhs_input = comp_group.is_input_tensor(_rhs); @@ -85,7 +83,7 @@ R"_( { code += R"_( - TILE({{DATA_TYPE}}, M0, N0, {{lhs}}); + TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}}); )_"; } @@ -93,7 +91,7 @@ R"_( { code += R"_( - TILE({{DATA_TYPE}}, M0, N0, {{rhs}}); + TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}}); )_"; } @@ -106,7 +104,7 @@ R"_( { code += R"_( - {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z; + {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}}); )_"; } @@ -115,25 +113,15 @@ R"_( { code += R"_( - {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z; + {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}}); )_"; } - if(is_broadcast) - { - code += - R"_( - T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); -)_"; - } - else - { - code += - R"_( - T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); + code += +R"_( + T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); )_"; - } if(is_root) { @@ -210,73 +198,33 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt // Set broadcast parameters // PRE: All tensors are broadcast-compatible - if(_lhs->tensor_shape() != _dst->tensor_shape()) - { - const auto is_broadcast_x = _lhs->dimension(0) == 1U && _dst->dimension(0) != 1U; - const auto is_broadcast_y = _lhs->dimension(1) == 1U && _dst->dimension(1) != 1U; - const auto is_broadcast_z = _lhs->dimension(2) == 1U && _dst->dimension(2) != 1U; - - // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy - if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed lhs win [M0xN0] = [1x1] - { - lut["lhs_m0"] = "1"; - lut["lhs_n0"] = "1"; - lut["lhs_start_ind_1"] = "0"; - lut["lhs_start_ind_0"] = "0"; - } - else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed lhs win [M0xN0] = [1xN] - { - lut["lhs_m0"] = "1"; - lut["lhs_n0"] = "N0"; - lut["lhs_start_ind_1"] = "0"; - lut["lhs_start_ind_0"] = "g_ind_0"; - } - else - { - ARM_COMPUTE_ERROR("Only support lhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions"); - } - } - else - { - lut["lhs_m0"] = "M0"; - lut["lhs_n0"] = "N0"; - lut["lhs_start_ind_1"] = "g_ind_1"; - lut["lhs_start_ind_0"] = "g_ind_0"; - } - - if(_rhs->tensor_shape() != _dst->tensor_shape()) - { - const auto is_broadcast_x = _rhs->dimension(0) == 1U && _dst->dimension(0) != 1U; - const auto is_broadcast_y = _rhs->dimension(1) == 1U && _dst->dimension(1) != 1U; - const auto is_broadcast_z = _rhs->dimension(2) == 1U && _dst->dimension(2) != 1U; - - // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy - if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1] - { - lut["rhs_m0"] = "1"; - lut["rhs_n0"] = "1"; - lut["rhs_start_ind_1"] = "0"; - lut["rhs_start_ind_0"] = "0"; - } - else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN] - { - lut["rhs_m0"] = "1"; - lut["rhs_n0"] = "N0"; - lut["rhs_start_ind_1"] = "0"; - lut["rhs_start_ind_0"] = "g_ind_0"; - } - else - { - ARM_COMPUTE_ERROR("Only support rhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions"); - } - } - else - { - lut["rhs_m0"] = "M0"; - lut["rhs_n0"] = "N0"; - lut["rhs_start_ind_1"] = "g_ind_1"; - lut["rhs_start_ind_0"] = "g_ind_0"; - } + const auto &lhs_dims = _lhs->tensor_shape(); + const auto &rhs_dims = _rhs->tensor_shape(); + const auto &dst_dims = _dst->tensor_shape(); + + const auto lhs_broadcast_x = dst_dims[0] != 1 && lhs_dims[0] == 1; + const auto rhs_broadcast_x = dst_dims[0] != 1 && rhs_dims[0] == 1; + const auto lhs_broadcast_y = dst_dims[1] != 1 && lhs_dims[1] == 1; + const auto rhs_broadcast_y = dst_dims[1] != 1 && rhs_dims[1] == 1; + const auto lhs_broadcast_z = dst_dims[2] != 1 && lhs_dims[2] == 1; + const auto rhs_broadcast_z = dst_dims[2] != 1 && rhs_dims[2] == 1; + + const auto lhs_broadcast_yz = lhs_broadcast_y && lhs_broadcast_z; + const auto rhs_broadcast_yz = rhs_broadcast_y && rhs_broadcast_z; + + lut["lhs_n0"] = (lhs_broadcast_x) ? "1" : "N0"; + lut["lhs_start_ind_0"] = (lhs_broadcast_x) ? "0" : "g_ind_0"; + lut["rhs_n0"] = (rhs_broadcast_x) ? "1" : "N0"; + lut["rhs_start_ind_0"] = (rhs_broadcast_x) ? "0" : "g_ind_0"; + + lut["lhs_m0"] = (lhs_broadcast_yz) ? "1" : "M0"; + lut["lhs_start_ind_1"] = (lhs_broadcast_yz) ? "0" : "g_ind_1"; + lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0"; + lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1"; + + lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : + (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : + ""; return lut; } diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index 047457c99e..c1e61444a8 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -308,13 +308,21 @@ public: : ZipDataset( ShapeDataset("Shape0", { - TensorShape{ 9U, 9U, 5U }, - TensorShape{ 27U, 13U, 2U }, + TensorShape{ 1U, 3U, 4U, 2U }, // LHS broadcast X + TensorShape{ 6U, 4U, 2U, 3U }, // RHS broadcast X + TensorShape{ 7U, 1U, 1U, 4U }, // LHS broadcast Y, Z + TensorShape{ 8U, 5U, 6U, 3U }, // RHS broadcast Y, Z + TensorShape{ 1U, 1U, 1U, 2U }, // LHS broadcast X, Y, Z + TensorShape{ 2U, 6U, 4U, 3U }, // RHS broadcast X, Y, Z }), ShapeDataset("Shape1", { - TensorShape{ 1U, 1U, 1U }, // Broadcast in X, Y, Z - TensorShape{ 27U, 1U, 1U }, // Broadcast in Y and Z + TensorShape{ 5U, 3U, 4U, 2U }, + TensorShape{ 1U, 4U, 2U, 3U }, + TensorShape{ 7U, 2U, 3U, 4U }, + TensorShape{ 8U, 1U, 1U, 3U }, + TensorShape{ 4U, 7U, 3U, 2U }, + TensorShape{ 1U, 1U, 1U, 3U }, })) { } diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp index 3743fbb664..1451ab3de8 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,9 +54,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), // S16 is valid data type for Add TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), // S32 is valid data type for Add TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes - TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting not allowed for lhs + TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), - TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching not supported + TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32), // Broadcast Y dimension is not allowed + TensorInfo(TensorShape( 3U, 8U, 9U), 1, DataType::S16), // Broadcast Z dimension is not allowed + TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching is allowed }), framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16), @@ -65,7 +67,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs - TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching not supported + TensorInfo(TensorShape(15U, 1U, 3U), 1, DataType::F32), + TensorInfo(TensorShape( 3U, 8U, 1U), 1, DataType::S16), + TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), })), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), @@ -74,9 +78,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(15U, 23U, 3U), 1, DataType::F32), + TensorInfo(TensorShape( 3U, 8U, 9U), 1, DataType::S16), TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), })), - framework::dataset::make("Expected", { true, false, true, true, false, false, true, false})), + framework::dataset::make("Expected", { true, false, true, true, false, true, true, false, false, true})), input1_info, input2_info, output_info, expected) { // Create a new workload sketch -- cgit v1.2.1