diff options
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp')
-rw-r--r-- | src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp | 208 |
1 files changed, 114 insertions, 94 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index 39cec6e31c..df8deee44f 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -65,94 +65,94 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup std::string code; const bool is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape(); const bool is_root = (comp_group.get_root_component()->id() == this->id()); + const bool is_lhs_input = comp_group.is_input_tensor(_lhs); + const bool is_rhs_input = comp_group.is_input_tensor(_rhs); - if(is_root) - { - code = + code = R"_( //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- -)_" - // IN_0(LHS) {{lhs}} - // IN_1(RHS) {{rhs}} - // OUT(dst, accum) {{dst}} - // dst = lhs + rhs (mix-precision, broadcast, boundary aware) -R"_( - TILE({{DATA_TYPE}}, M0, N0, {{dst}}); - TILE(uint, M0, 1, g_dst_indirect_y); +)_"; + + if(is_root) { - TILE({{DATA_TYPE}}, M0, N0, lhs_tile); - TILE({{DATA_TYPE}}, M0, N0, rhs_tile); -)_" - // Assuming un-collapsed window + code += R"_( - {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z; - {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z; - - T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, g_ind_0, g_ind_1, 1, {{lhs}}_stride_y, lhs_tile); - T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, rhs_tile); + TILE(uint, M0, 1, g_dst_indirect_y); )_"; - if(is_broadcast) - { - code += + } + + if(is_lhs_input) + { + code += R"_( - T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); + TILE({{DATA_TYPE}}, M0, N0, {{lhs}}); )_"; - } - else - { - code += + } + + if(is_rhs_input) + { + code += R"_( - T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); + TILE({{DATA_TYPE}}, M0, N0, {{rhs}}); )_"; - } + } + code += - // Calculate the destination indirect Y R"_( - LOOP_UNROLLING(int, i, 0, 1, M0, { - g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{out}}_w * {{out}}_h) - 1); - g_dst_indirect_y[i].v += g_ind_2 * (int)({{out}}_w * {{out}}_h); - }) - } - //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- )_"; - } - else // non-root + if(is_lhs_input) { - code = + code += R"_( - //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- -)_" - // IN_0/Out(Accumulator) {{acc}} - // IN_1(Operand) {{operand}} - // acc = operand + acc (mix-precision, broadcast, boundary aware) + {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z; + T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}}); +)_"; + } + + if(is_rhs_input) + { + code += R"_( + {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z; + T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}}); +)_"; + } + + if(is_broadcast) + { + code += + R"_( + T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); +)_"; + } + else { - TILE(DATA_TYPE, M0, N0, operand_tile); - T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{operand}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{operand}}_stride_y, operand_tile); + code += + R"_( + T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); )_"; + } - if(is_broadcast) - { - code += + if(is_root) + { + // Calculate the destination indirect Y + code += R"_( - T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}}); -)_"; - } - else + LOOP_UNROLLING(int, i, 0, 1, M0, { - code += -R"_( - T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}}); + g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); + g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h); + }) )_"; - } + } + code += R"_( } //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- )_"; - } return code; } @@ -160,86 +160,105 @@ R"_( void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { vtable.declare_variable( + comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), - comp_group.is_intermediate_tensor(_lhs), "lhs"); vtable.declare_variable( + comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), - comp_group.is_intermediate_tensor(_rhs), "rhs"); vtable.declare_variable( + comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), - comp_group.is_intermediate_tensor(_dst), "dst"); } TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { TagLUT lut{}; - const ITensorInfo *accumulator = _lhs; - const ITensorInfo *operand = _rhs; // Local build options lut["meta_kernel_id"] = id(); lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type()); // Arguments and global shared variables - const bool is_root = (comp_group.get_root_component()->id() == this->id()); - if(is_root) + + lut["lhs"] = vtable.get_variable(_lhs); + lut["rhs"] = vtable.get_variable(_rhs); + lut["dst"] = vtable.get_variable(_dst); + lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor()); + + switch(_attributes.operation()) { - lut["lhs"] = vtable.get_variable(_lhs); - lut["rhs"] = vtable.get_variable(_rhs); - lut["dst"] = vtable.get_variable(_dst); - lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor()); + case Attributes::ElementwiseOp::ADD: + lut["ELTWISE_OP"] = "ADD"; + break; + default: + ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); } - else + + ARM_COMPUTE_ERROR_ON( + comp_group.is_intermediate_tensor(_lhs) && + detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON( + comp_group.is_intermediate_tensor(_rhs) && + detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); + + // Set broadcast parameters + // PRE: All tensors are broadcast-compatible + if(_lhs->tensor_shape() != _dst->tensor_shape()) { - // Determine which tensor is the accumulator - if(comp_group.is_intermediate_tensor(_lhs)) + const auto is_broadcast_x = _lhs->dimension(0) == 1U && _dst->dimension(0) != 1U; + const auto is_broadcast_y = _lhs->dimension(1) == 1U && _dst->dimension(1) != 1U; + const auto is_broadcast_z = _lhs->dimension(2) == 1U && _dst->dimension(2) != 1U; + + // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy + if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed lhs win [M0xN0] = [1x1] { - accumulator = _lhs; - operand = _rhs; + lut["lhs_m0"] = "1"; + lut["lhs_n0"] = "1"; + lut["lhs_start_ind_1"] = "0"; + lut["lhs_start_ind_0"] = "0"; } - else if(comp_group.is_intermediate_tensor(_rhs)) + else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed lhs win [M0xN0] = [1xN] { - accumulator = _rhs; - operand = _lhs; + lut["lhs_m0"] = "1"; + lut["lhs_n0"] = "N0"; + lut["lhs_start_ind_1"] = "0"; + lut["lhs_start_ind_0"] = "g_ind_0"; } else { - ARM_COMPUTE_ERROR("Invalid elementwise component linking"); + ARM_COMPUTE_ERROR("Only support lhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions"); } - lut["acc"] = vtable.get_variable(accumulator); - lut["operand"] = vtable.get_variable(operand); } - switch(_attributes.operation()) + else { - case Attributes::ElementwiseOp::ADD: - lut["ELTWISE_OP"] = "ADD"; - break; - default: - ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); + lut["lhs_m0"] = "M0"; + lut["lhs_n0"] = "N0"; + lut["lhs_start_ind_1"] = "g_ind_1"; + lut["lhs_start_ind_0"] = "g_ind_0"; } - ARM_COMPUTE_ERROR_ON_MSG(detail::have_different_dimensions(accumulator->tensor_shape(), _dst->tensor_shape(), 0), "Only the operand can be broadcast to match the accumulator's shape"); - const bool is_broadcast = (operand->tensor_shape() != _dst->tensor_shape()); - // Set broadcast parameters - // PRE: All tensors are broadcast-compatible - if(is_broadcast) + if(_rhs->tensor_shape() != _dst->tensor_shape()) { + const auto is_broadcast_x = _rhs->dimension(0) == 1U && _dst->dimension(0) != 1U; + const auto is_broadcast_y = _rhs->dimension(1) == 1U && _dst->dimension(1) != 1U; + const auto is_broadcast_z = _rhs->dimension(2) == 1U && _dst->dimension(2) != 1U; + // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy - if(operand->dimension(0) == 1U && operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1] + if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1] { lut["rhs_m0"] = "1"; lut["rhs_n0"] = "1"; lut["rhs_start_ind_1"] = "0"; lut["rhs_start_ind_0"] = "0"; } - else if(operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN] + else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN] { lut["rhs_m0"] = "1"; lut["rhs_n0"] = "N0"; @@ -258,6 +277,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["rhs_start_ind_1"] = "g_ind_1"; lut["rhs_start_ind_0"] = "g_ind_0"; } + return lut; } |