aboutsummaryrefslogtreecommitdiff
path: root/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/experimental/dynamic_fusion/ClKernelBuildingImpl')
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h366
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h8
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp202
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h23
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp153
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h13
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp555
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h83
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h9
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp81
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h20
11 files changed, 532 insertions, 981 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index aa27572746..17437c285d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
@@ -36,6 +38,7 @@
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
+#include <iostream>
#include <queue>
#include <stack>
#include <string>
@@ -63,8 +66,8 @@ enum class SharedVarIO
enum class SharedVarGroup
{
- Argument, // Parameters to a kernel function
- Automatic // Automatic variables declared within the kernel body
+ Argument, // Parameters to a kernel function == dst or src tensors of the whole blueprint graph
+ Automatic // Automatic variables declared within the kernel body == intermediate tensors of the whole blueprint graph
};
/** Specifies a shared variable link for a component.
@@ -74,85 +77,151 @@ enum class SharedVarGroup
*/
struct SharedVarLink
{
- ArgumentID arg_id{ g_arg_placeholder };
- SharedVarIO io{ SharedVarIO::Input };
- SharedVarGroup group{ SharedVarGroup::Argument };
- bool is_empty() const
+ ArgumentID arg_id{ g_arg_placeholder };
+ SharedVarIO io{ SharedVarIO::Input };
+ bool is_empty() const
{
return arg_id == g_arg_placeholder;
}
};
/** A table of all the variables used in the kernel / blueprint
+ * Because we limit the DependencyGraph in the blueprint to a Linear Sequence for now, we only allow ** a single global variable (the accumulator) **
+ *
* NOTE: the order they appear in the table is the order of their "declaration" in the component code, and is also their ID
* NOTE: the variables all have the scope of the full kernel function
*/
class SharedVarTable
{
public:
+ /** A fully realized SharedVarLink
+ */
struct SharedVar
{
- SharedVarGroup group;
- std::string uniq_name; // Unique name, also the final variable name used in the built code
- ClKernelArgRuntimeDescriptor desc; // Automatic variables can and should still be described using this struct
+ ArgumentID arg_id{ g_arg_placeholder };
+ SharedVarIO io{ SharedVarIO::Input };
+ SharedVarGroup group{ SharedVarGroup::Argument };
+ std::string uniq_name{}; // Unique name, also the final variable name used in the built code
+ ClKernelArgDescriptor desc{}; // Automatic variables can and should still be described using this struct
+ bool is_empty() const
+ {
+ return arg_id == g_arg_placeholder;
+ }
};
- using Arguments = std::vector<SharedVar>;
+ class Arguments
+ {
+ public:
+ Arguments() = default;
+ void add_var(const SharedVar &var)
+ {
+ ARM_COMPUTE_ERROR_ON(var.group != SharedVarGroup::Argument);
+ _vars.push_back(var);
+ }
+ std::vector<SharedVar> get_all_vars() const
+ {
+ return _vars;
+ }
+ std::vector<SharedVar> get_src_vars() const
+ {
+ std::vector<SharedVar> src_vars;
+ std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(src_vars), [](const SharedVar & var)
+ {
+ return var.io == SharedVarIO::Input;
+ });
+ return src_vars;
+ }
+ SharedVar get_dst_var() const
+ {
+ std::vector<SharedVar> dst_vars;
+ std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(dst_vars), [](const SharedVar & var)
+ {
+ return var.io == SharedVarIO::Output;
+ });
+ ARM_COMPUTE_ERROR_ON(dst_vars.size() != 1);
+ return dst_vars.at(0);
+ }
+
+ private:
+ std::vector<SharedVar> _vars{};
+ };
- /** @note: The order of insertion is important. There is one precondition:
+ /** Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID). If one has already been created for the SharedVarLink, simply return it instead of creating a new one
+ *
+ * @note: The order of insertion is important. There is one precondition:
* PRECOND: The components have been sorted topologically / is being traversed in topological order
* This ensures that all the consumer var links (Output, Automatic Links) can consume (return) the producer var links when they're referred
*/
- SharedVar add(SharedVarLink var_link, ClKernelArgRuntimeDescriptor runtime_desc, const std::string &name = "unnamed")
+ void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name = "unnamed")
{
ARM_COMPUTE_ERROR_ON_MSG(var_link.is_empty(), "Non-empty SharedVarLink expected");
+ if(!get(var_link).is_empty())
+ {
+ return;
+ }
+
auto var_id = _num_var;
std::stringstream ss;
ss << name << "_" << var_id;
const auto uniq_name = ss.str();
- SharedVar var{ var_link.group, uniq_name, runtime_desc };
+ SharedVar var{ var_link.arg_id, var_link.io, group, uniq_name, runtime_desc };
- if(var_link.group == SharedVarGroup::Argument)
+ if(group == SharedVarGroup::Argument)
{
_arguments.emplace(var_id, var);
+ _arg_id_map.emplace(var_link.arg_id, var_id);
_num_var++;
- _var_id_lut[var_link.arg_id] = var_id;
}
- else if(var_link.group == SharedVarGroup::Automatic)
+ else if(group == SharedVarGroup::Automatic)
{
- if(var_link.io == SharedVarIO::Output)
+ if(_global_vars.empty())
{
- _global_vars.emplace(var_id, var);
- _num_var++;
- _var_id_lut[var_link.arg_id] = var_id;
+ if(var_link.io == SharedVarIO::Output)
+ {
+ _global_vars.emplace(var_id, var);
+ _arg_id_map.emplace(var_link.arg_id, var_id);
+ _num_var++;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Component likely not traversed in topological order");
+ }
}
else
{
- // For the input link, the var (and thus its arg_id) will always have been added by the time we get here if we traverse components in topological order
- var = get_var(var_link.arg_id);
+ // Associate additional SharedVarLinks with the single global shared variable
+ const auto global_var_id = _global_vars.begin()->first;
+ _arg_id_map[var_link.arg_id] = global_var_id;
}
}
else
{
ARM_COMPUTE_ERROR("Unrecognised SharedVarGroup");
}
- return var;
}
- SharedVar get_var(ArgumentID arg_id) const
+ /** Get the SharedVar associated with @p var_link
+ *
+ * @param var_link
+ * @return SharedVar
+ */
+ SharedVar get(const SharedVarLink &var_link) const
{
- const auto var_id = _var_id_lut.at(arg_id); // arg_id has to exist in lut to begin with
- auto it = _global_vars.find(var_id);
- if(it != _global_vars.end())
- {
- return it->second;
- }
- it = _arguments.find(var_id);
- if(it != _arguments.end())
+ const SharedVar empty_var{};
+ if(_arg_id_map.find(var_link.arg_id) != _arg_id_map.end())
{
- return it->second;
+ const auto var_id = _arg_id_map.at(var_link.arg_id);
+ const auto arg_var = _arguments.find(var_id);
+ if(arg_var != _arguments.end())
+ {
+ return arg_var->second;
+ }
+ else
+ {
+ return _global_vars.at(var_id);
+ }
}
- ARM_COMPUTE_ERROR("Cannot find component variable");
+ return empty_var;
}
/** @note The arguments are returned in the order they are added
@@ -162,7 +231,7 @@ public:
Arguments args{};
for(const auto &a : _arguments)
{
- args.push_back(a.second);
+ args.add_var(a.second);
}
return args;
}
@@ -171,9 +240,9 @@ private:
using VarID = int32_t;
private:
- std::map<VarID, SharedVar> _global_vars{};
- std::map<VarID, SharedVar> _arguments{};
- std::unordered_map<ArgumentID, VarID> _var_id_lut{};
+ std::map<VarID, SharedVar> _global_vars{}; // Shared, global variable
+ std::map<VarID, SharedVar> _arguments{};
+ std::map<ArgumentID, VarID> _arg_id_map{}; // Track ArgumentIDs that have already been added
VarID _num_var{ 0 };
};
@@ -184,7 +253,7 @@ enum class ComponentType
Store
};
-using ComponentID = int32_t;
+using ComponentID = DependencyGraph::Id;
using ComponentList = std::vector<ComponentID>;
class IClKernelComponent
{
@@ -224,7 +293,7 @@ public:
};
using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
public:
- IClKernelComponent(const ClKernelBlueprint *blueprint)
+ IClKernelComponent(ClKernelBlueprint *blueprint)
: _blueprint(blueprint)
{
}
@@ -304,12 +373,18 @@ public:
{
return Window{};
}
- /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code
+ /** Get the tag look-up table used to instantiate the component code.
*
* @param vtable
* @return TagLUT
*/
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const = 0;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const = 0;
+
+ /** Allocate all shared variables used by the component in the @p vtable
+ *
+ * @param vtable
+ */
+ virtual void allocate_shared_vars(SharedVarTable &vtable) const = 0;
virtual std::string get_dst_addr_calculation() const
{
@@ -331,7 +406,7 @@ public:
}
protected:
- const ClKernelBlueprint *_blueprint;
+ ClKernelBlueprint *_blueprint;
private:
ComponentID _id{};
@@ -348,18 +423,19 @@ public:
~Implementation() = default;
public:
- ArgumentID add_kernel_argument(const ClTensorDescriptor &tensor_desc)
+ Status update_merge_point(ArgumentID t_id, ArgumentID merge_point)
{
- _kernel_arguments.insert(std::make_pair(_num_args, tensor_desc));
- _shared_var_group_lut[_num_args] = SharedVarGroup::Argument;
- return _num_args++;
+ return _graph.update_merge_point(t_id, merge_point);
}
- ArgumentID add_intermediate_tensor()
+ ArgumentID add_kernel_tensor(ITensorInfo *tensor_info, ArgumentID merge_point = DependencyGraph::empty_id())
{
- _intermediate_tensors.insert(_num_args);
- _shared_var_group_lut[_num_args] = SharedVarGroup::Automatic;
- return _num_args++;
+ const auto id = _graph.add_tensor(merge_point);
+ if(_kernel_tensors.find(id) == _kernel_tensors.end())
+ {
+ _kernel_tensors.insert(std::make_pair(id, tensor_info));
+ }
+ return id;
}
void set_tile_info(const TileDescriptor &tile_info)
@@ -382,7 +458,7 @@ public:
for(const auto arg_id : args)
{
ARM_COMPUTE_UNUSED(arg_id);
- ARM_COMPUTE_ERROR_ON_MSG(_kernel_arguments.find(arg_id) == _kernel_arguments.end() && _intermediate_tensors.find(arg_id) == _intermediate_tensors.end() && arg_id != g_arg_placeholder,
+ ARM_COMPUTE_ERROR_ON_MSG(_kernel_tensors.find(arg_id) == _kernel_tensors.end() && arg_id != g_arg_placeholder,
"Trying to use an argument that hasn't been added to the blueprint");
}
}
@@ -395,29 +471,36 @@ public:
ARM_COMPUTE_ERROR_ON_MSG(_num_complex_components > 1, "Only one complex component per blueprint is supported.");
}
- // This flag specifies if the current component is the root of the component graph
- // If the root is set to -1, it means that a root hasn't been added yet
- bool is_graph_root = true;
-
// Get an unique ID for the component that's being added
- const ComponentID component_id = _num_components++;
+ std::vector<ArgumentID> src_tensors;
+ std::vector<ArgumentID> dst_tensors;
+ for(const auto &link : component->get_links())
+ {
+ if(link.is_empty())
+ {
+ continue;
+ }
+ if(link.io == SharedVarIO::Input)
+ {
+ src_tensors.push_back(link.arg_id);
+ }
+ else
+ {
+ dst_tensors.push_back(link.arg_id);
+ }
+ }
+ const ComponentID component_id = _graph.add_operator(src_tensors, dst_tensors).second;
component->set_id(component_id);
// Add this component to the component graph. Don't connect it to anything yet
_component_graph.emplace(component_id, ComponentList{});
- int32_t positional_arg = 0;
-
// For every { arg_id, arg_io } passed along with this component...
for(const auto &link : component->get_links())
{
const ArgumentID &arg_id = link.arg_id;
const SharedVarIO &arg_io = link.io;
- // A component is considered root only if all its input arguments are kernel arguments (or placeholders, which means nullptr)
- // This performs a check on every argument, and if one of them doesn't respect the condition, the component is not considered root
- is_graph_root &= (_kernel_arguments.find(arg_id) != _kernel_arguments.end()) || (arg_io == SharedVarIO::Output) || (arg_id == g_arg_placeholder);
-
// Add the arg_id to the map describing the input/output relationship between an argument and the components that use it, if it doesn't yet exist there
if(_outgoing_components.find(arg_id) == _outgoing_components.end())
{
@@ -454,15 +537,9 @@ public:
_incoming_components[arg_id].push_back(component_id);
}
-
- ++positional_arg;
}
- if(is_graph_root)
- {
- ARM_COMPUTE_ERROR_ON_MSG(_graph_root >= 0, "Trying to add more than one root to the graph");
- _graph_root = component_id;
- }
+ ARM_COMPUTE_ERROR_ON_MSG(_graph.get_root_ops().size() != 1, "Trying to add more than one root to the graph");
// Finally, add this component to the dictionary of components
_components.insert(std::make_pair(component_id, std::move(component)));
@@ -489,17 +566,28 @@ public:
std::set<std::string> additional_macros{};
std::vector<std::string> component_codes{}; // vector because order matters
- // Go through the components graph (topological sort) and fill the data structures above
+ // Step 1: Allocate all kernel argument shared variables before generating the component code
auto stack = topological_sort();
while(!stack.empty())
{
auto curr_component_id = stack.top();
auto &curr_component = _components.find(curr_component_id)->second;
+ curr_component->allocate_shared_vars(_vtable);
+
+ stack.pop();
+ }
+ // Step 2: Generate component codes
+ stack = topological_sort();
+ while(!stack.empty())
+ {
+ auto curr_component_id = stack.top();
+ auto &curr_component = _components.find(curr_component_id)->second;
+
auto curr_headers_list = curr_component->get_headers_list();
auto curr_additional_macros = curr_component->get_additional_macros();
auto curr_component_code = curr_component->get_component_code();
- const auto var_lut = curr_component->allocate_vars(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+ const auto var_lut = curr_component->get_tag_lut(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
component_codes.push_back(IClKernelComponent::replace_tags(curr_component_code, var_lut));
headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
@@ -511,7 +599,7 @@ public:
stack.pop();
}
- // This section assembles the data gathered by traversing the graph into the string "code"
+ // Step 3: Assemble the data gathered by traversing the graph into the string "code"
std::string code = "";
for(auto &header : headers_list)
@@ -596,34 +684,79 @@ public:
ClKernelArgList get_arguments() const
{
ClKernelArgList arg_list{};
- for(const auto &arg_var : _vtable.get_kernel_arguments())
+ for(const auto &arg_var : _vtable.get_kernel_arguments().get_all_vars())
{
- arg_list.push_back(arg_var.desc);
+ arg_list[arg_var.desc.arg_id] = arg_var.desc;
}
return arg_list;
}
- const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const
+ /** Get the arguments as shared vars from the vtable
+ *
+ * @return SharedVarTable::Arguments
+ */
+ SharedVarTable::Arguments get_argument_shared_vars() const
+ {
+ return _vtable.get_kernel_arguments();
+ }
+
+ const ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
{
- auto it = _kernel_arguments.find(id);
- if(it != _kernel_arguments.end())
+ auto it = _kernel_tensors.find(id);
+ if(it != _kernel_tensors.end())
{
- return &_kernel_arguments.find(id)->second;
+ return it->second;
}
return nullptr;
}
- ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
+ ITensorInfo *get_kernel_argument_info(const ArgumentID id)
{
- const ClTensorDescriptor *arg_desc = get_kernel_argument(id);
- if(arg_desc != nullptr)
+ auto it = _kernel_tensors.find(id);
+ if(it != _kernel_tensors.end())
{
- return arg_desc->tensor_info;
+ return it->second;
}
return nullptr;
}
+ /** Finalize graph construction. Graph is expected to not mutate after being finalized
+ */
+ void finalize()
+ {
+ cache_root_component();
+ assign_shared_var_group();
+ }
+
+ DependencyGraph get_graph() const
+ {
+ return _graph;
+ }
private:
+ void cache_root_component()
+ {
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON_MSG(roots.size() != 1, "Trying to add more than one root to the graph");
+ _graph_root = roots.at(0);
+ }
+ /** Assign the group for each shared var. Can only be performed at the end of the graph construction, before building
+ */
+ void assign_shared_var_group()
+ {
+ for(const auto &tensor : _kernel_tensors)
+ {
+ const auto tensor_id = tensor.first;
+ if(_graph.is_src_tensor(tensor_id) || _graph.is_dst_tensor(tensor_id))
+ {
+ _shared_var_group_lut[tensor_id] = SharedVarGroup::Argument;
+ }
+ else
+ {
+ _shared_var_group_lut[tensor_id] = SharedVarGroup::Automatic;
+ }
+ }
+ }
+
void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const
{
visited.insert(component_id);
@@ -666,41 +799,41 @@ private:
std::string code;
switch(var.desc.tensor_arg_type)
{
- case TensorArgType::Vector:
+ case ClKernelTensorArgType::Vector:
{
code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
break;
}
- case TensorArgType::Image:
+ case ClKernelTensorArgType::Image:
{
code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
break;
}
- case TensorArgType::Image_3D:
+ case ClKernelTensorArgType::Image_3D:
{
code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
code += "\n uint " + var.uniq_name + "_stride_z";
break;
}
- case TensorArgType::Image_3D_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
{
code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
code += "\n uint " + var.uniq_name + "_stride_z";
break;
}
- case TensorArgType::Tensor_4D_t_Buffer:
+ case ClKernelTensorArgType::Tensor_4D_t_Buffer:
{
code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
break;
}
- case TensorArgType::Tensor_4D_t_Image:
+ case ClKernelTensorArgType::Tensor_4D_t_Image:
{
code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
break;
}
default:
{
- ARM_COMPUTE_ERROR("Unsupported declaration generation for TensorArgType");
+ ARM_COMPUTE_ERROR("Unsupported declaration generation for ClKernelTensorArgType");
}
}
return code;
@@ -710,7 +843,7 @@ private:
{
std::string code = "\n__kernel void " + build_kernel_name() + "(";
- for(const auto &arg : argument_list)
+ for(const auto &arg : argument_list.get_all_vars())
{
code += generate_argument_declaration(arg) + ",";
}
@@ -722,54 +855,55 @@ private:
std::string generate_global_section() const
{
- std::string code = "";
- code += " uint g_x = get_global_id(0);\n";
- code += " uint g_y = get_global_id(1);\n";
- code += " uint g_z = get_global_id(2);\n\n";
+ auto dst_info = get_kernel_argument_info(_dst_id);
+ auto dst_w = dst_info->dimension(0);
+ auto dst_h = dst_info->dimension(1);
+ const auto tile_w = std::max(1, get_execution_window().x().step());
+ const auto tile_h = std::max(1, get_execution_window().y().step());
+ auto leftover_w = dst_w % tile_w;
+ auto leftover_h = dst_h % tile_h;
- size_t tile_dim_x = _tile_info.empty() ? 1 : _tile_info.tile_dims.x();
- size_t tile_dim_y = _tile_info.empty() ? 1 : _tile_info.tile_dims.y();
+ std::string code = "";
+ code += std::string(" int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+ code += std::string(" int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + std::to_string(leftover_h) + ");\n";
+ code += std::string(" int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n");
switch(_tile_info.clipping)
{
case ClippingStrategy::TOP_LEFT:
- code += " const bool g_cond_x = (g_x == 0);\n";
- code += " const bool g_cond_y = (g_y == 0);\n";
+ code += " const bool g_cond_x = (cout == 0);\n";
+ code += " const bool g_cond_y = (mout == 0);\n";
break;
case ClippingStrategy::TOP_RIGHT:
- code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
- code += " const bool g_cond_y = (g_y == 0);\n";
+ code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
+ code += " const bool g_cond_y = (mout == 0);\n";
break;
case ClippingStrategy::BOTTOM_LEFT:
- code += " const bool g_cond_x = (g_x == 0);\n";
- code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
+ code += " const bool g_cond_x = (cout == 0);\n";
+ code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
break;
case ClippingStrategy::BOTTOM_RIGHT:
- code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
- code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
+ code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
+ code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
break;
default:
ARM_COMPUTE_ERROR("Unsupported clipping strategy");
}
- code += "\n REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n";
- code += " REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n";
-
return code;
}
TileDescriptor _tile_info{};
- int32_t _num_args{};
- int32_t _num_components{};
int32_t _num_complex_components{};
ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers
- // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
+ DependencyGraph _graph{};
+
+ // Tensors, components and IDs with corresponding ptrs (except intermediate)
std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
- std::unordered_map<ArgumentID, ClTensorDescriptor> _kernel_arguments{};
- std::unordered_set<ArgumentID> _intermediate_tensors{};
+ std::unordered_map<ArgumentID, ITensorInfo *> _kernel_tensors{};
// Argument group lookup. Can be replaced by extending the ArgumentID type to include group info
std::unordered_map<ArgumentID, SharedVarGroup> _shared_var_group_lut{};
@@ -794,6 +928,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
index 41ab4e320b..d4feac7da9 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
@@ -72,6 +74,4 @@ inline std::string to_string(const ClKernelCode &code)
} // namespace experimental
} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
index f951ce3d46..11fb1d53d0 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
@@ -31,6 +33,7 @@
#include "src/core/helpers/WindowHelpers.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
namespace arm_compute
{
namespace experimental
@@ -44,7 +47,7 @@ ComponentType ClDirectConvolutionKernelComponent::get_component_type() const
std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const
{
- return std::set<std::string> { "helpers.h", "tile_helpers.h", "repeat.h" };
+ return std::set<std::string> { "helpers.h", "tile_helpers.h" };
}
Window ClDirectConvolutionKernelComponent::get_window() const
@@ -54,7 +57,17 @@ Window ClDirectConvolutionKernelComponent::get_window() const
auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
// Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info);
+ PadStrideInfo pad_stride_info
+ {
+ static_cast<unsigned int>(_desc.conv2d.stride.x()),
+ static_cast<unsigned int>(_desc.conv2d.stride.y()),
+ static_cast<unsigned int>(_desc.conv2d.pad.left),
+ static_cast<unsigned int>(_desc.conv2d.pad.right),
+ static_cast<unsigned int>(_desc.conv2d.pad.top),
+ static_cast<unsigned int>(_desc.conv2d.pad.bottom),
+ DimensionRoundingType::FLOOR /*default rounding type*/
+ };
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info);
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst_info, output_shape,
@@ -64,6 +77,9 @@ Window ClDirectConvolutionKernelComponent::get_window() const
const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u);
const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U;
+ // const unsigned int num_rows = 1;
+ // const unsigned int vec_size = tile_info.tile_dims.x();
+ // const unsigned int num_rows = tile_info.tile_dims.y();
// Create and configure kernel window
Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
@@ -95,27 +111,30 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src) {{src}}
// IN_1(wei) {{weight}}
+ )_";
+ if(bias_info != nullptr)
+ {
+ code += R"_(
// IN_1(bia) {{bias}}
+ )_";
+ }
+ code += R"_(
// OUT(dst, accum) {{dst}}
- const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
- const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
// Initialize the accumulators
TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
{
// All the tensor dimensions are passed at compile time.
// In case of dynamic tensor support, the following dimensions should be passed as function argument.
- #define _I{{WEI_WIDTH}} {{WEI_WIDTH}}
- #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}}
+ #define _IWEI_WIDTH {{WEI_WIDTH}}
+ #define _IWEI_HEIGHT {{WEI_HEIGHT}}
#define _ISRC_WIDTH {{src}}_w
#define _ISRC_HEIGHT {{src}}_h
#define _ISRC_CHANNELS {{src}}_c
- #define _IDST_WIDTH {{dst_w}}
- #define _IDST_HEIGHT {{dst_h}}
- #define _IDST_CHANNELS {{dst_c}}
- #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}})
+ #define _IDST_WIDTH {{arg_dst}}_w
+ #define _IDST_HEIGHT {{arg_dst}}_h
+ #define _IDST_CHANNELS {{arg_dst}}_c
+ #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
// .v = access the whole vector (OpenCL vector)
// .s[x] = access the vector element at position x (scalar access)
@@ -136,13 +155,11 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
{{dst}}[i].v = 0;
})
- uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0);
-
- for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i)
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
{
int ck = 0;
- int xk = i % _I{{WEI_WIDTH}};
- int yk = i / _I{{WEI_WIDTH}};
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_HEIGHT;
int k = 0;
for(; k <= (_ISRC_CHANNELS - K0); k += K0)
@@ -201,6 +218,16 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
}
code += R"_(
+ #undef _I_WEI_WIDTH
+ #undef _I_WEI_HEIGHT
+ #undef _ISRC_WIDTH
+ #undef _ISRC_HEIGHT
+ #undef _ISRC_CHANNELS
+ #undef _IDST_WIDTH
+ #undef _IDST_HEIGHT
+ #undef _IDST_CHANNELS
+ #undef _IY_MULTIPLIER
+
}
)_";
@@ -217,44 +244,7 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
}
code += R"_(
- #undef _I{{WEI_WIDTH}}
- #undef _I{{WEI_HEIGHT}}
- #undef _ISRC_WIDTH
- #undef _ISRC_HEIGHT
- #undef _ISRC_CHANNELS
- #undef _IDST_WIDTH
- #undef _IDST_HEIGHT
- #undef _IDST_CHANNELS
- #undef _IY_MULTIPLIER
}
-
- // Workaround for the discrepancy between tiles and repeats
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v;
-#if M0 >= 2
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v;
-#endif // M0 >= 2
-#if M0 >= 3
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v;
-#endif // M0 >= 3
-#if M0 >= 4
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v;
-#endif // M0 >= 4
-#if M0 >= 8
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v;
-#endif // M0 >= 8
-#if M0 == 16
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v;
-#endif // M0 == 16
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
)_";
return code.c_str();
@@ -306,19 +296,18 @@ bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target,
CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const
{
const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
- const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+ auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ // const auto tile_info = _blueprint->impl().get_tile_info();
const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
const DataType data_type = src_info->data_type();
- const GPUTarget gpu_target = ICLKernel().get_target();
-
- Window win = get_window();
+ const GPUTarget gpu_target = CLScheduler::get().target();
- const unsigned int n0 = win.x().step();
- const unsigned int m0 = win.y().step();
+ const unsigned int n0 = _blueprint->impl().get_execution_window().x().step();
+ const unsigned int m0 = _blueprint->impl().get_execution_window().y().step();
const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx));
- const unsigned int partial_store_n0 = dst_info->dimension(channel_idx) % n0;
+ const unsigned int partial_store_n0 = dst_info->dimension(0) % n0;
const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
// Update the padding for the weights tensor if we can export to cl_image
@@ -338,54 +327,79 @@ CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() cons
return build_opts;
}
-ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+ const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+ const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+
+ vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
+ const ClKernelTensorArgType weight_type = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer;
+ vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight");
+
+ if(!_bias.is_empty()) // optional bias
+ {
+ vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias");
+ }
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
+ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
{
TagLUT lut{};
const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
- const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- const GPUTarget gpu_target = ICLKernel().get_target();
- const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
- const TensorArgType weight_type = export_to_cl_image ? TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer;
- lut["meta_kernel_id"] = id();
- lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src");
- lut["weight"] = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight");
+ // Arguments and global shared variables
+ lut["src"] = vtable.get(_src);
+ lut["weight"] = vtable.get(_weight);
if(!_bias.is_empty()) // optional bias
{
- lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias");
+ lut["bias"] = vtable.get(_bias);
lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type());
}
- lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
-
- // Local build options
- const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
- const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
- const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
+ lut["dst"] = vtable.get(_dst);
- lut["dst_w"] = dst_info->dimension(width_idx);
- lut["dst_h"] = dst_info->dimension(height_idx);
- lut["dst_c"] = dst_info->dimension(channel_idx);
+ const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var();
+ lut["arg_dst"] = dst_argument.uniq_name;
- lut["ACC_DATA_TYPE"] = src_info->data_type();
- lut["SRC_DATA_TYPE"] = src_info->data_type();
- lut["WEI_DATA_TYPE"] = weight_info->data_type();
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["ACC_DATA_TYPE"] = src_info->data_type();
+ lut["SRC_DATA_TYPE"] = src_info->data_type();
+ lut["WEI_DATA_TYPE"] = weight_info->data_type();
lut["SRC_TENSOR_TYPE"] = "BUFFER";
- lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? "IMAGE" : "BUFFER";
-
- lut["WEI_WIDTH"] = weight_info->dimension(width_idx);
- lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
+ switch(vtable.get(_weight).desc.tensor_arg_type)
+ {
+ case ClKernelTensorArgType::Image_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Tensor_4D_t_Image:
+ {
+ lut["WEI_TENSOR_TYPE"] = "IMAGE";
+ break;
+ }
+ default:
+ {
+ lut["WEI_TENSOR_TYPE"] = "BUFFER";
+ break;
+ }
+ }
+ const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
+ const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
+ lut["WEI_WIDTH"] = weight_info->dimension(width_idx);
+ lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
- lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride());
- lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride());
+ lut["STRIDE_X"] = _desc.conv2d.stride.x();
+ lut["STRIDE_Y"] = _desc.conv2d.stride.y();
- lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left();
- lut["PAD_TOP"] = _desc.pad_stride_info.pad_top();
+ lut["PAD_LEFT"] = _desc.conv2d.pad.left;
+ lut["PAD_TOP"] = _desc.conv2d.pad.top;
lut["ZERO_VALUE"] = 0;
@@ -393,6 +407,4 @@ ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::a
}
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
index 10c0e00a58..af9a65debc 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
@@ -39,7 +41,7 @@ namespace dynamic_fusion
class ClDirectConvolutionKernelComponent : public IClKernelComponent
{
public:
- ClDirectConvolutionKernelComponent(const ClKernelBlueprint *blueprint, const DirectConvolutionDescriptor &desc,
+ ClDirectConvolutionKernelComponent(ClKernelBlueprint *blueprint, const ClDirectConv2dKernelDescriptor &desc,
const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{})
: IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst }
{
@@ -58,7 +60,8 @@ public:
return { _src, _weight, _bias, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::string name() const override
{
@@ -66,16 +69,14 @@ public:
}
private:
- DirectConvolutionDescriptor _desc{};
- Link _src{};
- Link _weight{};
- Link _bias{};
- Link _dst{};
+ ClDirectConv2dKernelDescriptor _desc{};
+ Link _src{};
+ Link _weight{};
+ Link _bias{};
+ Link _dst{};
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index 84e4003d5d..2bbea8725d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
#include "arm_compute/core/Validate.h"
@@ -41,7 +43,7 @@ ComponentType ClElementwiseAddKernelComponent::get_component_type() const
std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
{
- return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "gemm_helpers.h", "repeat.h", "tile_helpers.h" };
+ return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
}
Window ClElementwiseAddKernelComponent::get_window() const
@@ -67,63 +69,62 @@ Window ClElementwiseAddKernelComponent::get_window() const
std::string ClElementwiseAddKernelComponent::get_component_code() const
{
std::string code;
- return R"_(
+ const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+
+ if(is_root)
+ {
+ return R"_(
//------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
- // IN_0(Accumulator) {{acc}}
- // IN_1(Addend) {{addend}}
+ // IN_0(LHS) {{lhs}}
+ // IN_1(RHS) {{rhs}}
+ // OUT(dst, accum) {{dst}}
- // c = addend + c (mix-precision, broadcast, boundary aware)
+ // dst = lhs + rhs (mix-precision, broadcast, boundary aware)
+ TILE({{DATA_TYPE}}, M0, N0, {{dst}});
{
- __global uchar *addend_addr = {{addend}}_ptr + {{addend}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * {{addend}}_stride_y) + get_global_id(2) * {{addend}}_stride_z; \
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, addend, addend_addr, 0, {{addend}}_stride_y, g_zero, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD_X_POS_0, M0, N0, {{acc}}, addend, DATA_TYPE_ACCUMULATOR, addend_hp);
- }
+ TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
+ TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
- // Workaround for the discrepancy between tiles and repeats
-#if defined(IS_TILED)
- {{acc}}[0].v = {{acc}}0;
-#if M0 >= 2
- {{acc}}[1].v = {{acc}}1;
-#endif // M0 >= 2
-#if M0 >= 3
- {{acc}}[2].v = {{acc}}2;
-#endif // M0 >= 3
-#if M0 >= 4
- {{acc}}[3].v = {{acc}}3;
-#endif // M0 >= 4
-#if M0 >= 8
- {{acc}}[4].v = {{acc}}4;
- {{acc}}[5].v = {{acc}}5;
- {{acc}}[6].v = {{acc}}6;
- {{acc}}[7].v = {{acc}}7;
-#endif // M0 >= 8
-#if M0 == 16
- {{acc}}[8].v = {{acc}}8;
- {{acc}}[9].v = {{acc}}9;
- {{acc}}[10].v = {{acc}}A;
- {{acc}}[11].v = {{acc}}B;
- {{acc}}[12].v = {{acc}}C;
- {{acc}}[13].v = {{acc}}D;
- {{acc}}[14].v = {{acc}}E;
- {{acc}}[15].v = {{acc}}F;
-#endif // M0 == 16
-#endif // defined(IS_TILED)
+ T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile);
+ T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile);
+
+ T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ }
//------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+)_";
+ }
+ else
+ {
+ return R"_(
+ //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+ // IN_0/Out(Accumulator) {{acc}}
+ // IN_1(Addend) {{addend}}
+ // acc = addend + acc (mix-precision, broadcast, boundary aware)
+ {
+ TILE({{DATA_TYPE}}, M0, N0, addend_tile);
+
+ T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile);
+
+ T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+ }
+ //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
)_";
+ }
}
CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- auto tile_info = _blueprint->impl().get_tile_info();
+ const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
CLBuildOptions build_opts{};
+ const auto n0 = _blueprint->impl().get_execution_window().x().step();
+ const auto m0 = _blueprint->impl().get_execution_window().y().step();
+ const auto partial_m0 = t_dst_info->dimension(1) % m0;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
return build_opts;
}
@@ -142,34 +143,56 @@ std::string ClElementwiseAddKernelComponent::generate_config_id() const
return config_id;
}
-ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
{
- // Determine which argument is the accumulator
- Link accumulator;
- Link addend;
- if(_lhs.group == SharedVarGroup::Automatic)
+ const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+ vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs");
+ vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs");
+ if(is_root)
{
- accumulator = _lhs;
- addend = _rhs;
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
}
- else if(_rhs.group == SharedVarGroup::Automatic)
+}
+
+ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
+{
+ TagLUT lut{};
+ const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ // Arguments and global shared variables
+ const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+ if(is_root)
{
- accumulator = _rhs;
- addend = _lhs;
+ lut["lhs"] = vtable.get(_lhs);
+ lut["rhs"] = vtable.get(_rhs);
+ lut["dst"] = vtable.get(_dst);
}
else
{
- ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+ // Determine which link is the accumulator
+ Link accumulator;
+ Link addend;
+ if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic)
+ {
+ accumulator = _lhs;
+ addend = _rhs;
+ }
+ else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic)
+ {
+ accumulator = _rhs;
+ addend = _lhs;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+ }
+ lut["acc"] = vtable.get(accumulator);
+ lut["addend"] = vtable.get(addend);
}
- return {
- { "meta_kernel_id", id() },
- { "acc", vtable.add(accumulator, ClKernelArgRuntimeDescriptor(accumulator.arg_id, TensorArgType::Image_3D), "add_acc") },
- { "addend", vtable.add(addend, ClKernelArgRuntimeDescriptor(addend.arg_id, TensorArgType::Image_3D), "add_addend") },
- // {"dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst")}, // dst is needed for the root version and/or non-inplace version should we need one
- };
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type());
+ return lut;
}
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index 35c9538b8d..4f7b69724d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
@@ -37,7 +39,7 @@ namespace dynamic_fusion
class ClElementwiseAddKernelComponent : public IClKernelComponent
{
public:
- ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
+ ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
: IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
{
}
@@ -54,7 +56,8 @@ public:
return { _lhs, _rhs, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::string name() const override
{
@@ -70,6 +73,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
deleted file mode 100644
index 45b81b424d..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ /dev/null
@@ -1,555 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ComponentType ClGemmNativeKernelComponent::get_component_type() const
-{
- return ComponentType::Complex;
-}
-
-std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const
-{
- return std::set<std::string> { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" };
-}
-
-Window ClGemmNativeKernelComponent::get_window() const
-{
- ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
- ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
- ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
- ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
-
- bool reinterpret_input_as_3d = _desc.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation
- GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d,
- _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width,
- _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset);
-
- // dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info)));
-
- TensorInfo tmp_info(*dst_info);
-
- if(reinterpret_output_as_3d)
- {
- // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(dst_info->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- win = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
- win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
-
- AccessWindowStatic src0_access(lhs_info, 0, 0,
- lhs_info->dimension(0),
- lhs_info->dimension(1));
- AccessWindowStatic src1_access(rhs_info, 0, 0,
- ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0),
- rhs_info->dimension(1));
- AccessWindowStatic dst_access(dst_info, 0, 0,
- dst_info->dimension(0),
- dst_info->dimension(1));
-
- if(bias_info != nullptr)
- {
- const int bias_processed_per_iteration_x = _desc.rhs_info.n0;
-
- AccessWindowStatic src2_access(bias_info, 0, 0,
- ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x),
- bias_info->dimension(1));
-
- window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- if(window_changed == true)
- {
- ARM_COMPUTE_ERROR("Insufficient Padding!");
- }
-
- return collapsed;
-}
-
-std::string ClGemmNativeKernelComponent::get_additional_macros() const
-{
- return R"_(
-#define VFMA(a, b, c) \
-({ \
- c = fma(a, b, c); \
-})
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-)_";
-}
-
-std::string ClGemmNativeKernelComponent::get_component_code() const
-{
- auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
- auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
-
- auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha));
- auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
- auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
-
- std::string code = R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ---------------------
- // IN_0(lhs) {{lhs}}
- // IN_1(rhs) {{rhs}}
- )_";
-
- if(!_bias.is_empty())
- {
- code += R"_(
- // IN_2(bias) {{bias}}
- )_";
- }
-
- code += R"_(
- // OUT(dst, accum) {{dst}}
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), {{dst}}, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
- {
-#if defined(DUMMY_WORK_ITEMS)
- if((g_x * N0 >= N) || (g_y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = {{lhs}}_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * (uint){{lhs}}_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE);
- )_";
-
- if(dont_slide_b)
- {
- code += R"_(
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z;
- )_";
- }
- else
- {
- code += R"_(
- rhs_offset += g_z * {{rhs}}_stride_z;
- )_";
- }
-
- code += R"_(
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- )_";
-
- if(reinterpret_input_as_3d)
- {
- code += R"_(
- // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}};
- )_";
- }
- else
- {
- code += R"_(
- // Add offset for batched GEMM
- lhs_offset += g_z * {{lhs}}_stride_z;
- )_";
- }
-
- code += R"_(
- int i = 0;
-#if {{K0}} > 1
- for(; i <= (K - {{K0}}); i += {{K0}})
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero);
-
- RHS_VFMA_M0xN0(0, a, b0, {{dst}});
- RHS_VFMA_M0xN0(1, a, b1, {{dst}});
-#if {{K0}} > 2
- RHS_VFMA_M0xN0(2, a, b2, {{dst}});
-#endif // K0 > 2
-#if {{K0}} > 3
- RHS_VFMA_M0xN0(3, a, b3, {{dst}});
-#endif // K0 > 3
-#if {{K0}} > 4
- RHS_VFMA_M0xN0(4, a, b4, {{dst}});
- RHS_VFMA_M0xN0(5, a, b5, {{dst}});
- RHS_VFMA_M0xN0(6, a, b6, {{dst}});
- RHS_VFMA_M0xN0(7, a, b7, {{dst}});
-#endif // K0 > 4
-#if {{K0}} > 8
- RHS_VFMA_M0xN0(8, a, b8, {{dst}});
- RHS_VFMA_M0xN0(9, a, b9, {{dst}});
- RHS_VFMA_M0xN0(A, a, bA, {{dst}});
- RHS_VFMA_M0xN0(B, a, bB, {{dst}});
- RHS_VFMA_M0xN0(C, a, bC, {{dst}});
- RHS_VFMA_M0xN0(D, a, bD, {{dst}});
- RHS_VFMA_M0xN0(E, a, bE, {{dst}});
- RHS_VFMA_M0xN0(F, a, bF, {{dst}});
-#endif // K0 > 8
-
- lhs_offset += {{K0}} * sizeof(DATA_TYPE);
- rhs_offset += {{K0}} * {{rhs}}_stride_y;
- }
-#endif // K0 > 1
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 0 * {{lhs}}_stride_y + zlhs0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 1 * {{lhs}}_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 2 * {{lhs}}_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 3 * {{lhs}}_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 4 * {{lhs}}_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 5 * {{lhs}}_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 6 * {{lhs}}_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 7 * {{lhs}}_stride_y + zlhs7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b = VLOAD(N0)(0, (__global DATA_TYPE *)({{rhs}}_ptr + rhs_offset + 0 * {{rhs}}_stride_y));
- RHS_VFMA_M0xN0(0, a, b, {{dst}});
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += {{rhs}}_stride_y;
- }
-
- // Multiply by the weight of matrix-matrix product and store the result
- )_";
- if(has_alpha)
- {
- code += R"_(
- SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}});
- )_";
- }
-
- if(!_bias.is_empty())
- {
- if(_desc.broadcast_bias)
- {
- code += R"_(
- // Add beta*bias
- __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
- )_";
-
- if(helpers::float_ops::is_one(_desc.beta))
- {
- code += R"_(
- SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}});
- )_";
- }
-
- code += R"_(
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0);
- )_";
- }
- else
- {
- code += R"_(
- // Add beta*bias
- __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0,
- PARTIAL_STORE_M0)
- * {{bias}}_stride_y)
- + g_z * {{bias}}_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
- )_";
-
- if(helpers::float_ops::is_one(_desc.beta))
- {
- code += R"_(
- SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}});
- )_";
- }
-
- code += R"_(
- // c = c + bias
- ADD_BLOCK(M0, {{dst}}, bias);
- )_";
- }
- }
-
- code += R"_(
- }
- //------------------ END KERNEL {{meta_kernel_id}} ---------------------
- )_";
- return code.c_str();
-}
-
-CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const
-{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- auto tile_info = _blueprint->impl().get_tile_info();
-
- CLBuildOptions build_opts{};
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
- build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y()));
- build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x()));
- build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
-
- return build_opts;
-}
-
-std::string ClGemmNativeKernelComponent::generate_config_id() const
-{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- std::string config_id{};
- config_id += (_bias.is_empty() ? "add_bias_" : "");
- config_id += (_desc.broadcast_bias ? "broadcast_bias_" : "");
- config_id += (_desc.reinterpret_input_as_3d ? "3di_" : "");
- config_id += (_desc.depth_output_gemm3d > 0 ? "3do_" : "");
- config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(1));
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.k);
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(2));
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.lhs_info.m0);
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.rhs_info.n0);
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.rhs_info.k0);
- return config_id;
-}
-
-ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const
-{
- TagLUT lut{};
-
- lut["meta_kernel_id"] = id();
- lut["lhs"] = vtable.add(_lhs, ClKernelArgRuntimeDescriptor(_lhs.arg_id, TensorArgType::Image_3D), "lhs");
- lut["rhs"] = vtable.add(_rhs, ClKernelArgRuntimeDescriptor(_rhs.arg_id, TensorArgType::Image_3D), "rhs");
- if(!_bias.is_empty()) // optional bias
- {
- lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias");
- }
- lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst");
-
- // Local build options
- auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
- auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha));
- auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr;
- auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
- auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0;
- auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
-
- lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0);
-
- if(has_alpha)
- {
- lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha);
- }
- if(has_beta)
- {
- lut["BETA"] = float_to_string_with_full_precision(_desc.beta);
- }
- if(dont_slide_b)
- {
- lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2));
- }
-
- if(reinterpret_output_as_3d)
- {
- lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1));
- lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2));
- }
- else if(reinterpret_input_as_3d)
- {
- lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1));
- lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2));
- }
-
- return lut;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
deleted file mode 100644
index b282856b56..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-
-#include "arm_compute/core/Steps.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClGemmNativeKernelComponent : public IClKernelComponent
-{
-public:
- ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc,
- const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
- : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
- {
- }
-
- ComponentType get_component_type() const override;
- std::set<std::string> get_headers_list() const override;
- std::string get_additional_macros() const override;
- std::string get_component_code() const override;
- Window get_window() const override;
- ClKernelArgList get_args();
- CLBuildOptions generate_build_options() const override;
- std::string generate_config_id() const override;
-
- virtual std::vector<Link> get_links() const override
- {
- return { _lhs, _rhs, _bias, _dst };
- }
-
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
- virtual std::string name() const override
- {
- return "gemm_mm_native_" + std::to_string(id());
- }
-
-private:
- GemmNativeDescriptor _desc{};
- Link _lhs{};
- Link _rhs{};
- Link _bias{};
- Link _dst{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
index de02f948e9..c6716a0a23 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
@@ -21,16 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
index 5f023ba528..e0b210f4ed 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
@@ -65,25 +67,36 @@ std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const
CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const
{
auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- auto tile_info = _blueprint->impl().get_tile_info();
+ // auto tile_info = _blueprint->impl().get_tile_info();
CLBuildOptions build_opts{};
+ const auto n0 = _blueprint->impl().get_execution_window().x().step();
+ const auto m0 = _blueprint->impl().get_execution_window().y().step();
+ const auto partial_m0 = t_dst_info->dimension(0) % m0;
+ const auto partial_n0 = t_dst_info->dimension(1) % n0;
+
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_n0));
return build_opts;
}
-ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClStoreBlockBoundaryAwareKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+ vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Image_3D), "src");
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Image_3D), "dst");
+}
+
+ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
{
return {
{ "meta_kernel_id", id() },
- { "src", vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src") },
- { "dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst") },
+ { "src", vtable.get(_src) },
+ { "dst", vtable.get(_dst) },
};
}
@@ -96,19 +109,26 @@ std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() cons
{
return R"_(
//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+ {
+ #define _IDST_WIDTH {{dst}}_w
+ #define _IDST_HEIGHT {{dst}}_h
+ TILE(uint, M0, 1, dst_indirect_y);
- TILE(uint, M0, 1, dst_indirect_y);
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- dst_indirect_y[i].v = (uint)min(mout + i, (int)({{dst_w}} * {{dst_h}}) - 1);
- dst_indirect_y[i].v += bout * (int)({{dst_w}} * {{dst_h}});
- })
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
- T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, PARTIAL_N0 != 0 && g_cond_x, {{src}}, dst_indirect_y);
+ T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
- //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+ #undef _IDST_WIDTH
+ #undef _IDST_HEIGHT
+ //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+ }
)_";
}
@@ -120,21 +140,24 @@ CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options
return build_opts;
}
-ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClStoreIndirectWidthSelectKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+ vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
+ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
{
TagLUT lut{};
- lut["meta_kernel_id"] = id();
- lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src");
- lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
+ // Arguments and global shared variables
+ lut["src"] = vtable.get(_src);
+ lut["dst"] = vtable.get(_dst);
// Local build options
- auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- lut["dst_w"] = dst_info->dimension(1);
- lut["dst_h"] = dst_info->dimension(2);
-
+ lut["meta_kernel_id"] = id();
lut["DST_TENSOR_TYPE"] = "BUFFER";
+ const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
lut["DST_DATA_TYPE"] = dst_info->data_type();
return lut;
@@ -142,6 +165,4 @@ ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKern
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index c7da8bd3e8..26883d7fa0 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
@@ -37,21 +39,21 @@ namespace dynamic_fusion
class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent
{
public:
- ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+ ClStoreBlockBoundaryAwareKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
: IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
{
}
ComponentType get_component_type() const override;
std::string get_component_code() const override;
CLBuildOptions generate_build_options() const override;
+ TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::vector<Link> get_links() const override
{
return { _src, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
virtual std::string name() const override
{
return "";
@@ -65,21 +67,21 @@ private:
class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent
{
public:
- ClStoreIndirectWidthSelectKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+ ClStoreIndirectWidthSelectKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
: IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
{
}
ComponentType get_component_type() const override;
std::string get_component_code() const override;
CLBuildOptions generate_build_options() const override;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::vector<Link> get_links() const override
{
return { _src, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
virtual std::string name() const override
{
return "";
@@ -93,6 +95,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H \ No newline at end of file