path: root/src/dynamic_fusion/sketch/gpu/template_writer/cl
author     Viet-Hoa Do <viet-hoa.do@arm.com>   2022-12-16 14:45:57 +0000
committer  Viet-Hoa Do <viet-hoa.do@arm.com>   2022-12-30 13:59:23 +0000
commit     3558c5840e7c973e2b1a86ae3a9335b44cad59d4 (patch)
tree       b5f14b344ff8bc03e5143a54a5f3480263db543e /src/dynamic_fusion/sketch/gpu/template_writer/cl
parent     9d3bd41030366326e9c8afe5db3a5812a76b135b (diff)
download   ComputeLibrary-3558c5840e7c973e2b1a86ae3a9335b44cad59d4.tar.gz
Add temporary tile support for dynamic fusion
* Multiple intermediate tensors can share the same tile.
  - A simple operator can reuse the input tensor for its result if the input
    tensor has the same shape and data type as the output and is only consumed
    by that operator.
  - A special case is when a simple operator and an output operator consume the
    same tensor. Since the output operator does not change the content of the
    input tensor, it does not count as "consuming" the input tensor.
* These temporary tiles are declared automatically by the template writer, so
  individual operators no longer need to generate output tile declarations.
* Cast is now a simple operator.

Resolves: COMPMID-5778

Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I232647ac976645e2d266a62e055b9eb48c356a8e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8877
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
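The diff below removes the per-operator `TILE({{DATA_TYPE}}, M0, N0, {{dst}});` declarations and instead has ClTemplateWriter emit one declaration per shared intermediate tensor. The following is a minimal standalone sketch of that emission loop, assuming hypothetical stand-in types: `TileVariable` and `write_tile_declarations` are illustrative substitutes for the real variable-table lookups (`_vtable.get_variable`, `get_cl_type_from_data_type`) shown in the ClTemplateWriter.cpp hunk at the end of the patch.

    // Standalone sketch of the centralized tile declaration added in this patch.
    // Not ComputeLibrary API: TileVariable stands in for the variable-table entry
    // the real writer looks up per intermediate tensor.
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    struct TileVariable
    {
        std::string uniq_name; // unique kernel-side name, e.g. "t1_dst"
        std::string cl_type;   // OpenCL C type of the tensor, e.g. "float"
    };

    std::string write_tile_declarations(const std::vector<TileVariable> &tiles)
    {
        std::stringstream ss;
        ss << "    //------------------ START TILE DECLARATION ---------------------\n";
        for(const auto &tile : tiles)
        {
            // Mirrors the generated line in the diff: TILE(<type>, M0, N0, <name>);
            ss << "    TILE(" << tile.cl_type << ", M0, N0, " << tile.uniq_name << ");\n";
        }
        ss << "    //------------------ END TILE DECLARATION ---------------------\n";
        return ss.str();
    }

    int main()
    {
        // Two intermediate tensors that need their own tiles; a tensor that reuses
        // its producer's tile would simply not appear in this list.
        const std::vector<TileVariable> tiles = { { "t1_dst", "float" }, { "t3_dst", "half" } };
        std::cout << write_tile_declarations(tiles);
        return 0;
    }

Because the declarations are emitted once, before any component code, operators such as Cast and ElementwiseBinary can write straight into the shared tile (note the `{{src}}`/`{{lhs}}` tiles replacing the local `in_data`/`lhs_tile` temporaries in the hunks below).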
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/template_writer/cl')
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp            |  12
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp                   |  37
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp        |   9
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp          |   9
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp      | 208
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp |   8
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp                 |   5
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp                  |   4
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp                 |  20
9 files changed, 181 insertions, 131 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
index 8adf056912..53e74b4187 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
@@ -67,14 +67,14 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_
// IN(src) {{src}}
// OUT(dst, accum) {{dst}}
-TILE({{DATA_TYPE}}, M0, N0, {{dst}});
+TILE({{DATA_TYPE}}, M0, N0, {{src}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
{{src}}_offset_first_element_in_bytes += g_ind_2 * {{src}}_stride_z;
- T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{dst}});
+ T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{src}});
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{dst}}, {{dst}});
+ T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
}
LOOP_UNROLLING(int, i, 0, 1, M0,
@@ -91,7 +91,7 @@ LOOP_UNROLLING(int, i, 0, 1, M0,
// IN/OUT(src, accum) {{src}}
{
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{src}});
+ T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
}
)_";
}
@@ -104,15 +104,15 @@ LOOP_UNROLLING(int, i, 0, 1, M0,
void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
index 6ab3a68bb0..dcb43f9783 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
@@ -54,20 +54,26 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group)
ARM_COMPUTE_UNUSED(comp_group);
const std::string kernel_name = get_name();
+ const auto is_root = (comp_group.get_root_component()->id() == this->id());
std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+//------------------ START KERNEL {{meta_kernel_id}} CAST ---------------------
+)_";
+
+ if(is_root)
+ {
+ code += R"_(
// IN_0(src) {{src}}
// OUT(dst, accum) {{dst}}
-TILE({{DATA_TYPE_OUT}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
{{src}}_offset_first_element_in_bytes += get_global_id(2) * {{src}}_stride_z;
- TILE({{DATA_TYPE_IN}}, M0, N0, in_data);
- T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, in_data);
+ TILE({{DATA_TYPE_IN}}, M0, N0, {{tmp}});
+ T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{tmp}});
)_";
+ }
code += R"_(
LOOP_UNROLLING(int, m0, 0, 1, M0,
@@ -77,20 +83,20 @@ TILE(uint, M0, 1, g_dst_indirect_y);
if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
{
code += R"_(
- in_data[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
+ {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
)_";
}
if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
{
code += R"_(
- {{dst}}[m0].v = CONVERT_SAT(in_data[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
+ {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
)_";
}
else
{
code += R"_(
- {{dst}}[m0].v = CONVERT(in_data[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
+ {{dst}}[m0].v = CONVERT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
)_";
}
@@ -98,7 +104,9 @@ TILE(uint, M0, 1, g_dst_indirect_y);
})
)_";
- code += R"_(
+ if(is_root)
+ {
+ code += R"_(
LOOP_UNROLLING(int, i, 0, 1, M0,
{
g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
@@ -106,7 +114,11 @@ TILE(uint, M0, 1, g_dst_indirect_y);
g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
})
}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+ }
+
+ code += R"_(
+//------------------ END KERNEL {{meta_kernel_id}} CAST ---------------------
)_";
return code;
@@ -115,27 +127,28 @@ TILE(uint, M0, 1, g_dst_indirect_y);
void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- ARM_COMPUTE_UNUSED(comp_group);
+ const auto is_root = (comp_group.get_root_component()->id() == this->id());
TagLUT lut{};
// Arguments and global shared variables
lut["src"] = vtable.get_variable(_src);
lut["dst"] = vtable.get_variable(_dst);
+ lut["tmp"] = (is_root) ? lut["src"].value + "_in_data" : lut["src"];
const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
index 6fa77aafe3..ab7cc9f05a 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
@@ -81,7 +81,6 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &
code += R"_(
// OUT(dst, accum) {{dst}}
-TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
@@ -206,9 +205,9 @@ void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable
GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(input_type),
- comp_group.is_intermediate_tensor(_src),
"src");
const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ?
@@ -216,23 +215,23 @@ void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable
GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_weight,
GpuKernelArgumentInfo(weight_type),
- comp_group.is_intermediate_tensor(_weight),
"weight");
if(_bias != nullptr && _bias->has_valid_id()) // optional bias
{
vtable.declare_variable(
+ comp_group,
_bias,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- comp_group.is_intermediate_tensor(_bias),
"bias");
}
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index 26399c50a9..c6e14f90c5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -86,7 +86,6 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com
code += R"_(
// OUT(dst, accum) {{dst}}
-TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
@@ -227,30 +226,30 @@ code += R"_(
void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_weight,
GpuKernelArgumentInfo(weight_type),
- comp_group.is_intermediate_tensor(_weight),
"weight");
if(_bias && _bias->has_valid_id()) // optional bias
{
vtable.declare_variable(
+ comp_group,
_bias,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- comp_group.is_intermediate_tensor(_bias),
"bias");
}
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 39cec6e31c..df8deee44f 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -65,94 +65,94 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup
std::string code;
const bool is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape();
const bool is_root = (comp_group.get_root_component()->id() == this->id());
+ const bool is_lhs_input = comp_group.is_input_tensor(_lhs);
+ const bool is_rhs_input = comp_group.is_input_tensor(_rhs);
- if(is_root)
- {
- code =
+ code =
R"_(
//------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_"
- // IN_0(LHS) {{lhs}}
- // IN_1(RHS) {{rhs}}
- // OUT(dst, accum) {{dst}}
- // dst = lhs + rhs (mix-precision, broadcast, boundary aware)
-R"_(
- TILE({{DATA_TYPE}}, M0, N0, {{dst}});
- TILE(uint, M0, 1, g_dst_indirect_y);
+)_";
+
+ if(is_root)
{
- TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
- TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
-)_"
- // Assuming un-collapsed window
+ code +=
R"_(
- {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z;
- {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z;
-
- T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, g_ind_0, g_ind_1, 1, {{lhs}}_stride_y, lhs_tile);
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, rhs_tile);
+ TILE(uint, M0, 1, g_dst_indirect_y);
)_";
- if(is_broadcast)
- {
- code +=
+ }
+
+ if(is_lhs_input)
+ {
+ code +=
R"_(
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ TILE({{DATA_TYPE}}, M0, N0, {{lhs}});
)_";
- }
- else
- {
- code +=
+ }
+
+ if(is_rhs_input)
+ {
+ code +=
R"_(
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ TILE({{DATA_TYPE}}, M0, N0, {{rhs}});
)_";
- }
+ }
+
code +=
- // Calculate the destination indirect Y
R"_(
- LOOP_UNROLLING(int, i, 0, 1, M0,
{
- g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{out}}_w * {{out}}_h) - 1);
- g_dst_indirect_y[i].v += g_ind_2 * (int)({{out}}_w * {{out}}_h);
- })
- }
- //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
- }
- else // non-root
+ if(is_lhs_input)
{
- code =
+ code +=
R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_"
- // IN_0/Out(Accumulator) {{acc}}
- // IN_1(Operand) {{operand}}
- // acc = operand + acc (mix-precision, broadcast, boundary aware)
+ {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z;
+ T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
+)_";
+ }
+
+ if(is_rhs_input)
+ {
+ code +=
R"_(
+ {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z;
+ T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
+)_";
+ }
+
+ if(is_broadcast)
+ {
+ code +=
+ R"_(
+ T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
+)_";
+ }
+ else
{
- TILE(DATA_TYPE, M0, N0, operand_tile);
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{operand}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{operand}}_stride_y, operand_tile);
+ code +=
+ R"_(
+ T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
)_";
+ }
- if(is_broadcast)
- {
- code +=
+ if(is_root)
+ {
+ // Calculate the destination indirect Y
+ code +=
R"_(
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
-)_";
- }
- else
+ LOOP_UNROLLING(int, i, 0, 1, M0,
{
- code +=
-R"_(
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
+ g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
+ g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
+ })
)_";
- }
+ }
+
code +=
R"_(
}
//------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
- }
return code;
}
@@ -160,86 +160,105 @@ R"_(
void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_lhs,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_lhs),
"lhs");
vtable.declare_variable(
+ comp_group,
_rhs,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_rhs),
"rhs");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
TagLUT lut{};
- const ITensorInfo *accumulator = _lhs;
- const ITensorInfo *operand = _rhs;
// Local build options
lut["meta_kernel_id"] = id();
lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type());
// Arguments and global shared variables
- const bool is_root = (comp_group.get_root_component()->id() == this->id());
- if(is_root)
+
+ lut["lhs"] = vtable.get_variable(_lhs);
+ lut["rhs"] = vtable.get_variable(_rhs);
+ lut["dst"] = vtable.get_variable(_dst);
+ lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
+
+ switch(_attributes.operation())
{
- lut["lhs"] = vtable.get_variable(_lhs);
- lut["rhs"] = vtable.get_variable(_rhs);
- lut["dst"] = vtable.get_variable(_dst);
- lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor());
+ case Attributes::ElementwiseOp::ADD:
+ lut["ELTWISE_OP"] = "ADD";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
}
- else
+
+ ARM_COMPUTE_ERROR_ON(
+ comp_group.is_intermediate_tensor(_lhs) &&
+ detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
+ ARM_COMPUTE_ERROR_ON(
+ comp_group.is_intermediate_tensor(_rhs) &&
+ detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
+
+ // Set broadcast parameters
+ // PRE: All tensors are broadcast-compatible
+ if(_lhs->tensor_shape() != _dst->tensor_shape())
{
- // Determine which tensor is the accumulator
- if(comp_group.is_intermediate_tensor(_lhs))
+ const auto is_broadcast_x = _lhs->dimension(0) == 1U && _dst->dimension(0) != 1U;
+ const auto is_broadcast_y = _lhs->dimension(1) == 1U && _dst->dimension(1) != 1U;
+ const auto is_broadcast_z = _lhs->dimension(2) == 1U && _dst->dimension(2) != 1U;
+
+ // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
+ if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed lhs win [M0xN0] = [1x1]
{
- accumulator = _lhs;
- operand = _rhs;
+ lut["lhs_m0"] = "1";
+ lut["lhs_n0"] = "1";
+ lut["lhs_start_ind_1"] = "0";
+ lut["lhs_start_ind_0"] = "0";
}
- else if(comp_group.is_intermediate_tensor(_rhs))
+ else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed lhs win [M0xN0] = [1xN]
{
- accumulator = _rhs;
- operand = _lhs;
+ lut["lhs_m0"] = "1";
+ lut["lhs_n0"] = "N0";
+ lut["lhs_start_ind_1"] = "0";
+ lut["lhs_start_ind_0"] = "g_ind_0";
}
else
{
- ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+ ARM_COMPUTE_ERROR("Only support lhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions");
}
- lut["acc"] = vtable.get_variable(accumulator);
- lut["operand"] = vtable.get_variable(operand);
}
- switch(_attributes.operation())
+ else
{
- case Attributes::ElementwiseOp::ADD:
- lut["ELTWISE_OP"] = "ADD";
- break;
- default:
- ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
+ lut["lhs_m0"] = "M0";
+ lut["lhs_n0"] = "N0";
+ lut["lhs_start_ind_1"] = "g_ind_1";
+ lut["lhs_start_ind_0"] = "g_ind_0";
}
- ARM_COMPUTE_ERROR_ON_MSG(detail::have_different_dimensions(accumulator->tensor_shape(), _dst->tensor_shape(), 0), "Only the operand can be broadcast to match the accumulator's shape");
- const bool is_broadcast = (operand->tensor_shape() != _dst->tensor_shape());
- // Set broadcast parameters
- // PRE: All tensors are broadcast-compatible
- if(is_broadcast)
+ if(_rhs->tensor_shape() != _dst->tensor_shape())
{
+ const auto is_broadcast_x = _rhs->dimension(0) == 1U && _dst->dimension(0) != 1U;
+ const auto is_broadcast_y = _rhs->dimension(1) == 1U && _dst->dimension(1) != 1U;
+ const auto is_broadcast_z = _rhs->dimension(2) == 1U && _dst->dimension(2) != 1U;
+
// Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
- if(operand->dimension(0) == 1U && operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
+ if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
{
lut["rhs_m0"] = "1";
lut["rhs_n0"] = "1";
lut["rhs_start_ind_1"] = "0";
lut["rhs_start_ind_0"] = "0";
}
- else if(operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
+ else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
{
lut["rhs_m0"] = "1";
lut["rhs_n0"] = "N0";
@@ -258,6 +277,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["rhs_start_ind_1"] = "g_ind_1";
lut["rhs_start_ind_0"] = "g_ind_0";
}
+
return lut;
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
index 05bdd27f11..8f1ed95351 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
@@ -190,21 +190,21 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_sum,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_sum),
"sum");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
@@ -274,4 +274,4 @@ Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
index a2c04d94e5..bcb6492b43 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
@@ -54,7 +54,6 @@ std::string ClTemplateResize::get_component_code(const IGpuTemplateComponentWrit
std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-TILE({{DST_DATA_TYPE}}, 1, N0, {{dst}});
TILE(uint, 1, 1, g_dst_indirect_y);
{
const int yo = g_ind_2 % {{arg_dst}}_h;
@@ -180,15 +179,15 @@ TILE(uint, 1, 1, g_dst_indirect_y);
void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index ef4f2f22a1..217214ced3 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -62,14 +62,14 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group
void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index eed481f109..2ab6316947 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -191,6 +191,26 @@ std::string ClTemplateWriter::write_code()
code += write_global_section();
code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+ {
+ const auto tiles = _components.get_tiles();
+ std::stringstream tiles_ss;
+
+ tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n";
+
+ for(auto tile : tiles)
+ {
+ const auto var = _vtable.get_variable(tile);
+ const auto data_type = get_cl_type_from_data_type(tile->data_type());
+ const auto var_name = var.uniq_name;
+
+ tiles_ss << " TILE(" << data_type << ", M0, N0, " << var_name << ");\n";
+ }
+
+ tiles_ss << " //------------------ END TILE DECLARATION ---------------------\n";
+
+ code += tiles_ss.str();
+ }
+
for(const auto &component_code : component_codes)
{
code += component_code;