Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/template_writer/cl')
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp              12
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp                    37
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp          9
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp             9
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp      208
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp   8
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp                   5
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp                    4
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp                  20
9 files changed, 181 insertions, 131 deletions
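
[Editor's note] The change that recurs across every file below is the new shape of GpuKernelVariableTable::declare_variable: the ComponentGroup now comes first in the argument list, and callers no longer pass comp_group.is_intermediate_tensor(...) themselves. A minimal, self-contained sketch of what the new call sites imply; the stub types and the body are assumptions, not the library's actual definitions:

#include <string>

// Stubs standing in for the library's types; only what the sketch needs.
struct ITensorInfo
{
};
struct GpuKernelArgumentInfo
{
};
struct ComponentGroup
{
    bool is_intermediate_tensor(const ITensorInfo *) const
    {
        return false; // stub
    }
};

struct GpuKernelVariableTable
{
    // Signature inferred from the call sites in this patch.
    void declare_variable(const ComponentGroup &comp_group,
                          const ITensorInfo    *tensor,
                          GpuKernelArgumentInfo argument_info,
                          const std::string    &alias)
    {
        // The flag every caller previously computed is now derived here.
        const bool is_interm = comp_group.is_intermediate_tensor(tensor);
        (void)is_interm;
        (void)argument_info;
        (void)alias;
        // ... register the variable under a unique name, as before ...
    }
};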
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
index 8adf056912..53e74b4187 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
@@ -67,14 +67,14 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_
// IN(src) {{src}}
// OUT(dst, accum) {{dst}}
-TILE({{DATA_TYPE}}, M0, N0, {{dst}});
+TILE({{DATA_TYPE}}, M0, N0, {{src}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
{{src}}_offset_first_element_in_bytes += g_ind_2 * {{src}}_stride_z;
- T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{dst}});
+ T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{src}});
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{dst}}, {{dst}});
+ T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
}
LOOP_UNROLLING(int, i, 0, 1, M0,
@@ -91,7 +91,7 @@ LOOP_UNROLLING(int, i, 0, 1, M0,
// IN/OUT(src, accum) {{src}}
{
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{src}});
+ T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
}
)_";
}
@@ -104,15 +104,15 @@ LOOP_UNROLLING(int, i, 0, 1, M0,
void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
index 6ab3a68bb0..dcb43f9783 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
@@ -54,20 +54,26 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group)
ARM_COMPUTE_UNUSED(comp_group);
const std::string kernel_name = get_name();
+ const auto is_root = (comp_group.get_root_component()->id() == this->id());
std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+//------------------ START KERNEL {{meta_kernel_id}} CAST ---------------------
+)_";
+
+ if(is_root)
+ {
+ code += R"_(
// IN_0(src) {{src}}
// OUT(dst, accum) {{dst}}
-TILE({{DATA_TYPE_OUT}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
{{src}}_offset_first_element_in_bytes += get_global_id(2) * {{src}}_stride_z;
- TILE({{DATA_TYPE_IN}}, M0, N0, in_data);
- T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, in_data);
+ TILE({{DATA_TYPE_IN}}, M0, N0, {{tmp}});
+ T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{tmp}});
)_";
+ }
code += R"_(
LOOP_UNROLLING(int, m0, 0, 1, M0,
@@ -77,20 +83,20 @@ TILE(uint, M0, 1, g_dst_indirect_y);
if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
{
code += R"_(
- in_data[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
+ {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
)_";
}
if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
{
code += R"_(
- {{dst}}[m0].v = CONVERT_SAT(in_data[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
+ {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
)_";
}
else
{
code += R"_(
- {{dst}}[m0].v = CONVERT(in_data[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
+ {{dst}}[m0].v = CONVERT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
)_";
}
@@ -98,7 +104,9 @@ TILE(uint, M0, 1, g_dst_indirect_y);
})
)_";
- code += R"_(
+ if(is_root)
+ {
+ code += R"_(
LOOP_UNROLLING(int, i, 0, 1, M0,
{
g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
@@ -106,7 +114,11 @@ TILE(uint, M0, 1, g_dst_indirect_y);
g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
})
}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+ }
+
+ code += R"_(
+//------------------ END KERNEL {{meta_kernel_id}} CAST ---------------------
)_";
return code;
@@ -115,27 +127,28 @@ TILE(uint, M0, 1, g_dst_indirect_y);
void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- ARM_COMPUTE_UNUSED(comp_group);
+ const auto is_root = (comp_group.get_root_component()->id() == this->id());
TagLUT lut{};
// Arguments and global shared variables
lut["src"] = vtable.get_variable(_src);
lut["dst"] = vtable.get_variable(_dst);
+ lut["tmp"] = (is_root) ? lut["src"].value + "_in_data" : lut["src"];
const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
index 6fa77aafe3..ab7cc9f05a 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
@@ -81,7 +81,6 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &
code += R"_(
// OUT(dst, accum) {{dst}}
-TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
@@ -206,9 +205,9 @@ void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable
GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(input_type),
- comp_group.is_intermediate_tensor(_src),
"src");
const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ?
@@ -216,23 +215,23 @@ void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable
GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_weight,
GpuKernelArgumentInfo(weight_type),
- comp_group.is_intermediate_tensor(_weight),
"weight");
if(_bias != nullptr && _bias->has_valid_id()) // optional bias
{
vtable.declare_variable(
+ comp_group,
_bias,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- comp_group.is_intermediate_tensor(_bias),
"bias");
}
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index 26399c50a9..c6e14f90c5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -86,7 +86,6 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com
code += R"_(
// OUT(dst, accum) {{dst}}
-TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
@@ -227,30 +226,30 @@ code += R"_(
void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_weight,
GpuKernelArgumentInfo(weight_type),
- comp_group.is_intermediate_tensor(_weight),
"weight");
if(_bias && _bias->has_valid_id()) // optional bias
{
vtable.declare_variable(
+ comp_group,
_bias,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- comp_group.is_intermediate_tensor(_bias),
"bias");
}
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 39cec6e31c..df8deee44f 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -65,94 +65,94 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup
std::string code;
const bool is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape();
const bool is_root = (comp_group.get_root_component()->id() == this->id());
+ const bool is_lhs_input = comp_group.is_input_tensor(_lhs);
+ const bool is_rhs_input = comp_group.is_input_tensor(_rhs);
- if(is_root)
- {
- code =
+ code =
R"_(
//------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_"
- // IN_0(LHS) {{lhs}}
- // IN_1(RHS) {{rhs}}
- // OUT(dst, accum) {{dst}}
- // dst = lhs + rhs (mix-precision, broadcast, boundary aware)
-R"_(
- TILE({{DATA_TYPE}}, M0, N0, {{dst}});
- TILE(uint, M0, 1, g_dst_indirect_y);
+)_";
+
+ if(is_root)
{
- TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
- TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
-)_"
- // Assuming un-collapsed window
+ code +=
R"_(
- {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z;
- {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z;
-
- T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, g_ind_0, g_ind_1, 1, {{lhs}}_stride_y, lhs_tile);
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, rhs_tile);
+ TILE(uint, M0, 1, g_dst_indirect_y);
)_";
- if(is_broadcast)
- {
- code +=
+ }
+
+ if(is_lhs_input)
+ {
+ code +=
R"_(
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ TILE({{DATA_TYPE}}, M0, N0, {{lhs}});
)_";
- }
- else
- {
- code +=
+ }
+
+ if(is_rhs_input)
+ {
+ code +=
R"_(
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ TILE({{DATA_TYPE}}, M0, N0, {{rhs}});
)_";
- }
+ }
+
code +=
- // Calculate the destination indirect Y
R"_(
- LOOP_UNROLLING(int, i, 0, 1, M0,
{
- g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{out}}_w * {{out}}_h) - 1);
- g_dst_indirect_y[i].v += g_ind_2 * (int)({{out}}_w * {{out}}_h);
- })
- }
- //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
- }
- else // non-root
+ if(is_lhs_input)
{
- code =
+ code +=
R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_"
- // IN_0/Out(Accumulator) {{acc}}
- // IN_1(Operand) {{operand}}
- // acc = operand + acc (mix-precision, broadcast, boundary aware)
+ {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z;
+ T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
+)_";
+ }
+
+ if(is_rhs_input)
+ {
+ code +=
R"_(
+ {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z;
+ T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
+)_";
+ }
+
+ if(is_broadcast)
+ {
+ code +=
+ R"_(
+ T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
+)_";
+ }
+ else
{
- TILE(DATA_TYPE, M0, N0, operand_tile);
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{operand}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{operand}}_stride_y, operand_tile);
+ code +=
+ R"_(
+ T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
)_";
+ }
- if(is_broadcast)
- {
- code +=
+ if(is_root)
+ {
+ // Calculate the destination indirect Y
+ code +=
R"_(
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
-)_";
- }
- else
+ LOOP_UNROLLING(int, i, 0, 1, M0,
{
- code +=
-R"_(
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
+ g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
+ g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
+ })
)_";
- }
+ }
+
code +=
R"_(
}
//------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
- }
return code;
}
@@ -160,86 +160,105 @@ R"_(
void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_lhs,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_lhs),
"lhs");
vtable.declare_variable(
+ comp_group,
_rhs,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_rhs),
"rhs");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
TagLUT lut{};
- const ITensorInfo *accumulator = _lhs;
- const ITensorInfo *operand = _rhs;
// Local build options
lut["meta_kernel_id"] = id();
lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type());
// Arguments and global shared variables
- const bool is_root = (comp_group.get_root_component()->id() == this->id());
- if(is_root)
+
+ lut["lhs"] = vtable.get_variable(_lhs);
+ lut["rhs"] = vtable.get_variable(_rhs);
+ lut["dst"] = vtable.get_variable(_dst);
+ lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
+
+ switch(_attributes.operation())
{
- lut["lhs"] = vtable.get_variable(_lhs);
- lut["rhs"] = vtable.get_variable(_rhs);
- lut["dst"] = vtable.get_variable(_dst);
- lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor());
+ case Attributes::ElementwiseOp::ADD:
+ lut["ELTWISE_OP"] = "ADD";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
}
- else
+
+ ARM_COMPUTE_ERROR_ON(
+ comp_group.is_intermediate_tensor(_lhs) &&
+ detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
+ ARM_COMPUTE_ERROR_ON(
+ comp_group.is_intermediate_tensor(_rhs) &&
+ detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
+
+ // Set broadcast parameters
+ // PRE: All tensors are broadcast-compatible
+ if(_lhs->tensor_shape() != _dst->tensor_shape())
{
- // Determine which tensor is the accumulator
- if(comp_group.is_intermediate_tensor(_lhs))
+ const auto is_broadcast_x = _lhs->dimension(0) == 1U && _dst->dimension(0) != 1U;
+ const auto is_broadcast_y = _lhs->dimension(1) == 1U && _dst->dimension(1) != 1U;
+ const auto is_broadcast_z = _lhs->dimension(2) == 1U && _dst->dimension(2) != 1U;
+
+ // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
+ if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed lhs win [M0xN0] = [1x1]
{
- accumulator = _lhs;
- operand = _rhs;
+ lut["lhs_m0"] = "1";
+ lut["lhs_n0"] = "1";
+ lut["lhs_start_ind_1"] = "0";
+ lut["lhs_start_ind_0"] = "0";
}
- else if(comp_group.is_intermediate_tensor(_rhs))
+ else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed lhs win [M0xN0] = [1xN]
{
- accumulator = _rhs;
- operand = _lhs;
+ lut["lhs_m0"] = "1";
+ lut["lhs_n0"] = "N0";
+ lut["lhs_start_ind_1"] = "0";
+ lut["lhs_start_ind_0"] = "g_ind_0";
}
else
{
- ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+ ARM_COMPUTE_ERROR("Only support lhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions");
}
- lut["acc"] = vtable.get_variable(accumulator);
- lut["operand"] = vtable.get_variable(operand);
}
- switch(_attributes.operation())
+ else
{
- case Attributes::ElementwiseOp::ADD:
- lut["ELTWISE_OP"] = "ADD";
- break;
- default:
- ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
+ lut["lhs_m0"] = "M0";
+ lut["lhs_n0"] = "N0";
+ lut["lhs_start_ind_1"] = "g_ind_1";
+ lut["lhs_start_ind_0"] = "g_ind_0";
}
- ARM_COMPUTE_ERROR_ON_MSG(detail::have_different_dimensions(accumulator->tensor_shape(), _dst->tensor_shape(), 0), "Only the operand can be broadcast to match the accumulator's shape");
- const bool is_broadcast = (operand->tensor_shape() != _dst->tensor_shape());
- // Set broadcast parameters
- // PRE: All tensors are broadcast-compatible
- if(is_broadcast)
+ if(_rhs->tensor_shape() != _dst->tensor_shape())
{
+ const auto is_broadcast_x = _rhs->dimension(0) == 1U && _dst->dimension(0) != 1U;
+ const auto is_broadcast_y = _rhs->dimension(1) == 1U && _dst->dimension(1) != 1U;
+ const auto is_broadcast_z = _rhs->dimension(2) == 1U && _dst->dimension(2) != 1U;
+
// Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
- if(operand->dimension(0) == 1U && operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
+ if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
{
lut["rhs_m0"] = "1";
lut["rhs_n0"] = "1";
lut["rhs_start_ind_1"] = "0";
lut["rhs_start_ind_0"] = "0";
}
- else if(operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
+ else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
{
lut["rhs_m0"] = "1";
lut["rhs_n0"] = "N0";
@@ -258,6 +277,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["rhs_start_ind_1"] = "g_ind_1";
lut["rhs_start_ind_0"] = "g_ind_0";
}
+
return lut;
}
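
[Editor's note] After this rewrite both inputs go through the same broadcast-window selection; the old accumulator/operand asymmetry is gone. A self-contained sketch of that selection, using a simplified 3-dimensional shape in place of ITensorInfo::tensor_shape(); the types and function name are assumptions for illustration:

#include <array>
#include <stdexcept>
#include <string>

struct WindowParams
{
    std::string m0, n0, start_ind_0, start_ind_1;
};

WindowParams select_window(const std::array<unsigned, 3> &in,
                           const std::array<unsigned, 3> &dst)
{
    if(in == dst) // no broadcast: full collapsed window
    {
        return { "M0", "N0", "g_ind_0", "g_ind_1" };
    }
    const bool bx = in[0] == 1U && dst[0] != 1U;
    const bool by = in[1] == 1U && dst[1] != 1U;
    const bool bz = in[2] == 1U && dst[2] != 1U;

    if(bx && by && bz) // broadcast in X, Y, Z: collapsed window [1x1]
    {
        return { "1", "1", "0", "0" };
    }
    if(by && bz) // broadcast in Y and Z: collapsed window [1xN0]
    {
        return { "1", "N0", "g_ind_0", "0" };
    }
    // Any other pattern is rejected, mirroring the ARM_COMPUTE_ERROR above.
    throw std::runtime_error("unsupported broadcast pattern");
}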
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
index 05bdd27f11..8f1ed95351 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
@@ -190,21 +190,21 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_sum,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_sum),
"sum");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
@@ -274,4 +274,4 @@ Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
index a2c04d94e5..bcb6492b43 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
@@ -54,7 +54,6 @@ std::string ClTemplateResize::get_component_code(const IGpuTemplateComponentWrit
std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-TILE({{DST_DATA_TYPE}}, 1, N0, {{dst}});
TILE(uint, 1, 1, g_dst_indirect_y);
{
const int yo = g_ind_2 % {{arg_dst}}_h;
@@ -180,15 +179,15 @@ TILE(uint, 1, 1, g_dst_indirect_y);
void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index ef4f2f22a1..217214ced3 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -62,14 +62,14 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group
void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index eed481f109..2ab6316947 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -191,6 +191,26 @@ std::string ClTemplateWriter::write_code()
code += write_global_section();
code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+ {
+ const auto tiles = _components.get_tiles();
+ std::stringstream tiles_ss;
+
+ tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n";
+
+ for(auto tile : tiles)
+ {
+ const auto var = _vtable.get_variable(tile);
+ const auto data_type = get_cl_type_from_data_type(tile->data_type());
+ const auto var_name = var.uniq_name;
+
+ tiles_ss << " TILE(" << data_type << ", M0, N0, " << var_name << ");\n";
+ }
+
+ tiles_ss << " //------------------ END TILE DECLARATION ---------------------\n";
+
+ code += tiles_ss.str();
+ }
+
for(const auto &component_code : component_codes)
{
code += component_code;
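
[Editor's note] For illustration, a standalone sketch of the tile-declaration pass added above, with hypothetical (data type, unique name) pairs standing in for what _components.get_tiles() and _vtable.get_variable() would yield:

#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    // Assumed example tiles; in the library these come from the component group.
    const std::vector<std::pair<std::string, std::string>> tiles = {
        { "half", "G0_dst" }, { "float", "G1_dst" }
    };

    std::stringstream tiles_ss;
    tiles_ss << "  //------------------ START TILE DECLARATION ---------------------\n";
    for(const auto &tile : tiles)
    {
        // Same emission as the patch: one TILE(type, M0, N0, name); per tile.
        tiles_ss << "  TILE(" << tile.first << ", M0, N0, " << tile.second << ");\n";
    }
    tiles_ss << "  //------------------ END TILE DECLARATION ---------------------\n";

    std::cout << tiles_ss.str();
    return 0;
}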