diff options
author | SiCong Li <sicong.li@arm.com> | 2022-01-28 18:24:39 +0000 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2022-05-06 15:01:45 +0000 |
commit | b63b1196adea8b07dd8db77c2492a212650deba0 (patch) | |
tree | b264035197873f56c69784bec68cad7041b5d423 /src/core/experimental/dynamic_fusion/ClKernelBuildingImpl | |
parent | 3bb72b69566f18ad5c9446d318d2fc2b5f6dba42 (diff) | |
download | ComputeLibrary-b63b1196adea8b07dd8db77c2492a212650deba0.tar.gz |
Integrate Dynamic Fusion patches
* Add public interfaces:
* OperatorGraph: Describe a workload that could contain fused kernels
* IWorkload: Generic interface for workloads built from OperatorGraph
* ClWorkload: OpenCL workloads built from OperatorGraph
* ClCompositeOperator: Runtime async operator to execute a ClWorkload
* DependencyGraph (will likely be deprecated in later iterations)
* Add example
* cl_fused_conv2d_elementwise_add.cpp to explain how to use the new
interfaces
* Add internal translation layer
* Refactor ClKernelBuildingAPI
* Remove non-tile based gemm native kernel component
* Minor interface changes
* Add integration tests
Resolves COMPMID-5161
Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/experimental/dynamic_fusion/ClKernelBuildingImpl')
11 files changed, 532 insertions, 981 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h index aa27572746..17437c285d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H @@ -36,6 +38,7 @@ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" +#include <iostream> #include <queue> #include <stack> #include <string> @@ -63,8 +66,8 @@ enum class SharedVarIO enum class SharedVarGroup { - Argument, // Parameters to a kernel function - Automatic // Automatic variables declared within the kernel body + Argument, // Parameters to a kernel function == dst or src tensors of the whole blueprint graph + Automatic // Automatic variables declared within the kernel body == intermediate tensors of the whole blueprint graph }; /** Specifies a shared variable link for a component. @@ -74,85 +77,151 @@ enum class SharedVarGroup */ struct SharedVarLink { - ArgumentID arg_id{ g_arg_placeholder }; - SharedVarIO io{ SharedVarIO::Input }; - SharedVarGroup group{ SharedVarGroup::Argument }; - bool is_empty() const + ArgumentID arg_id{ g_arg_placeholder }; + SharedVarIO io{ SharedVarIO::Input }; + bool is_empty() const { return arg_id == g_arg_placeholder; } }; /** A table of all the variables used in the kernel / blueprint + * Because we limit the DependencyGraph in the blueprint to a Linear Sequence for now, we only allow ** a single global variable (the accumulator) ** + * * NOTE: the order they appear in the table is the order of their "declaration" in the component code, and is also their ID * NOTE: the variables all have the scope of the full kernel function */ class SharedVarTable { public: + /** A fully realized SharedVarLink + */ struct SharedVar { - SharedVarGroup group; - std::string uniq_name; // Unique name, also the final variable name used in the built code - ClKernelArgRuntimeDescriptor desc; // Automatic variables can and should still be described using this struct + ArgumentID arg_id{ g_arg_placeholder }; + SharedVarIO io{ SharedVarIO::Input }; + SharedVarGroup group{ SharedVarGroup::Argument }; + std::string uniq_name{}; // Unique name, also the final variable name used in the built code + ClKernelArgDescriptor desc{}; // Automatic variables can and should still be described using this struct + bool is_empty() const + { + return arg_id == g_arg_placeholder; + } }; - using Arguments = std::vector<SharedVar>; + class Arguments + { + public: + Arguments() = default; + void add_var(const SharedVar &var) + { + ARM_COMPUTE_ERROR_ON(var.group != SharedVarGroup::Argument); + _vars.push_back(var); + } + std::vector<SharedVar> get_all_vars() const + { + return _vars; + } + std::vector<SharedVar> get_src_vars() const + { + std::vector<SharedVar> src_vars; + std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(src_vars), [](const SharedVar & var) + { + return var.io == SharedVarIO::Input; + }); + return src_vars; + } + SharedVar get_dst_var() const + { + std::vector<SharedVar> dst_vars; + std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(dst_vars), [](const SharedVar & var) + { + return var.io == SharedVarIO::Output; + }); + ARM_COMPUTE_ERROR_ON(dst_vars.size() != 1); + return dst_vars.at(0); + } + + private: + std::vector<SharedVar> _vars{}; + }; - /** @note: The order of insertion is important. There is one precondition: + /** Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID). If one has already been created for the SharedVarLink, simply return it instead of creating a new one + * + * @note: The order of insertion is important. There is one precondition: * PRECOND: The components have been sorted topologically / is being traversed in topological order * This ensures that all the consumer var links (Output, Automatic Links) can consume (return) the producer var links when they're referred */ - SharedVar add(SharedVarLink var_link, ClKernelArgRuntimeDescriptor runtime_desc, const std::string &name = "unnamed") + void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name = "unnamed") { ARM_COMPUTE_ERROR_ON_MSG(var_link.is_empty(), "Non-empty SharedVarLink expected"); + if(!get(var_link).is_empty()) + { + return; + } + auto var_id = _num_var; std::stringstream ss; ss << name << "_" << var_id; const auto uniq_name = ss.str(); - SharedVar var{ var_link.group, uniq_name, runtime_desc }; + SharedVar var{ var_link.arg_id, var_link.io, group, uniq_name, runtime_desc }; - if(var_link.group == SharedVarGroup::Argument) + if(group == SharedVarGroup::Argument) { _arguments.emplace(var_id, var); + _arg_id_map.emplace(var_link.arg_id, var_id); _num_var++; - _var_id_lut[var_link.arg_id] = var_id; } - else if(var_link.group == SharedVarGroup::Automatic) + else if(group == SharedVarGroup::Automatic) { - if(var_link.io == SharedVarIO::Output) + if(_global_vars.empty()) { - _global_vars.emplace(var_id, var); - _num_var++; - _var_id_lut[var_link.arg_id] = var_id; + if(var_link.io == SharedVarIO::Output) + { + _global_vars.emplace(var_id, var); + _arg_id_map.emplace(var_link.arg_id, var_id); + _num_var++; + } + else + { + ARM_COMPUTE_ERROR("Component likely not traversed in topological order"); + } } else { - // For the input link, the var (and thus its arg_id) will always have been added by the time we get here if we traverse components in topological order - var = get_var(var_link.arg_id); + // Associate additional SharedVarLinks with the single global shared variable + const auto global_var_id = _global_vars.begin()->first; + _arg_id_map[var_link.arg_id] = global_var_id; } } else { ARM_COMPUTE_ERROR("Unrecognised SharedVarGroup"); } - return var; } - SharedVar get_var(ArgumentID arg_id) const + /** Get the SharedVar associated with @p var_link + * + * @param var_link + * @return SharedVar + */ + SharedVar get(const SharedVarLink &var_link) const { - const auto var_id = _var_id_lut.at(arg_id); // arg_id has to exist in lut to begin with - auto it = _global_vars.find(var_id); - if(it != _global_vars.end()) - { - return it->second; - } - it = _arguments.find(var_id); - if(it != _arguments.end()) + const SharedVar empty_var{}; + if(_arg_id_map.find(var_link.arg_id) != _arg_id_map.end()) { - return it->second; + const auto var_id = _arg_id_map.at(var_link.arg_id); + const auto arg_var = _arguments.find(var_id); + if(arg_var != _arguments.end()) + { + return arg_var->second; + } + else + { + return _global_vars.at(var_id); + } } - ARM_COMPUTE_ERROR("Cannot find component variable"); + return empty_var; } /** @note The arguments are returned in the order they are added @@ -162,7 +231,7 @@ public: Arguments args{}; for(const auto &a : _arguments) { - args.push_back(a.second); + args.add_var(a.second); } return args; } @@ -171,9 +240,9 @@ private: using VarID = int32_t; private: - std::map<VarID, SharedVar> _global_vars{}; - std::map<VarID, SharedVar> _arguments{}; - std::unordered_map<ArgumentID, VarID> _var_id_lut{}; + std::map<VarID, SharedVar> _global_vars{}; // Shared, global variable + std::map<VarID, SharedVar> _arguments{}; + std::map<ArgumentID, VarID> _arg_id_map{}; // Track ArgumentIDs that have already been added VarID _num_var{ 0 }; }; @@ -184,7 +253,7 @@ enum class ComponentType Store }; -using ComponentID = int32_t; +using ComponentID = DependencyGraph::Id; using ComponentList = std::vector<ComponentID>; class IClKernelComponent { @@ -224,7 +293,7 @@ public: }; using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags public: - IClKernelComponent(const ClKernelBlueprint *blueprint) + IClKernelComponent(ClKernelBlueprint *blueprint) : _blueprint(blueprint) { } @@ -304,12 +373,18 @@ public: { return Window{}; } - /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code + /** Get the tag look-up table used to instantiate the component code. * * @param vtable * @return TagLUT */ - virtual TagLUT allocate_vars(SharedVarTable &vtable) const = 0; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const = 0; + + /** Allocate all shared variables used by the component in the @p vtable + * + * @param vtable + */ + virtual void allocate_shared_vars(SharedVarTable &vtable) const = 0; virtual std::string get_dst_addr_calculation() const { @@ -331,7 +406,7 @@ public: } protected: - const ClKernelBlueprint *_blueprint; + ClKernelBlueprint *_blueprint; private: ComponentID _id{}; @@ -348,18 +423,19 @@ public: ~Implementation() = default; public: - ArgumentID add_kernel_argument(const ClTensorDescriptor &tensor_desc) + Status update_merge_point(ArgumentID t_id, ArgumentID merge_point) { - _kernel_arguments.insert(std::make_pair(_num_args, tensor_desc)); - _shared_var_group_lut[_num_args] = SharedVarGroup::Argument; - return _num_args++; + return _graph.update_merge_point(t_id, merge_point); } - ArgumentID add_intermediate_tensor() + ArgumentID add_kernel_tensor(ITensorInfo *tensor_info, ArgumentID merge_point = DependencyGraph::empty_id()) { - _intermediate_tensors.insert(_num_args); - _shared_var_group_lut[_num_args] = SharedVarGroup::Automatic; - return _num_args++; + const auto id = _graph.add_tensor(merge_point); + if(_kernel_tensors.find(id) == _kernel_tensors.end()) + { + _kernel_tensors.insert(std::make_pair(id, tensor_info)); + } + return id; } void set_tile_info(const TileDescriptor &tile_info) @@ -382,7 +458,7 @@ public: for(const auto arg_id : args) { ARM_COMPUTE_UNUSED(arg_id); - ARM_COMPUTE_ERROR_ON_MSG(_kernel_arguments.find(arg_id) == _kernel_arguments.end() && _intermediate_tensors.find(arg_id) == _intermediate_tensors.end() && arg_id != g_arg_placeholder, + ARM_COMPUTE_ERROR_ON_MSG(_kernel_tensors.find(arg_id) == _kernel_tensors.end() && arg_id != g_arg_placeholder, "Trying to use an argument that hasn't been added to the blueprint"); } } @@ -395,29 +471,36 @@ public: ARM_COMPUTE_ERROR_ON_MSG(_num_complex_components > 1, "Only one complex component per blueprint is supported."); } - // This flag specifies if the current component is the root of the component graph - // If the root is set to -1, it means that a root hasn't been added yet - bool is_graph_root = true; - // Get an unique ID for the component that's being added - const ComponentID component_id = _num_components++; + std::vector<ArgumentID> src_tensors; + std::vector<ArgumentID> dst_tensors; + for(const auto &link : component->get_links()) + { + if(link.is_empty()) + { + continue; + } + if(link.io == SharedVarIO::Input) + { + src_tensors.push_back(link.arg_id); + } + else + { + dst_tensors.push_back(link.arg_id); + } + } + const ComponentID component_id = _graph.add_operator(src_tensors, dst_tensors).second; component->set_id(component_id); // Add this component to the component graph. Don't connect it to anything yet _component_graph.emplace(component_id, ComponentList{}); - int32_t positional_arg = 0; - // For every { arg_id, arg_io } passed along with this component... for(const auto &link : component->get_links()) { const ArgumentID &arg_id = link.arg_id; const SharedVarIO &arg_io = link.io; - // A component is considered root only if all its input arguments are kernel arguments (or placeholders, which means nullptr) - // This performs a check on every argument, and if one of them doesn't respect the condition, the component is not considered root - is_graph_root &= (_kernel_arguments.find(arg_id) != _kernel_arguments.end()) || (arg_io == SharedVarIO::Output) || (arg_id == g_arg_placeholder); - // Add the arg_id to the map describing the input/output relationship between an argument and the components that use it, if it doesn't yet exist there if(_outgoing_components.find(arg_id) == _outgoing_components.end()) { @@ -454,15 +537,9 @@ public: _incoming_components[arg_id].push_back(component_id); } - - ++positional_arg; } - if(is_graph_root) - { - ARM_COMPUTE_ERROR_ON_MSG(_graph_root >= 0, "Trying to add more than one root to the graph"); - _graph_root = component_id; - } + ARM_COMPUTE_ERROR_ON_MSG(_graph.get_root_ops().size() != 1, "Trying to add more than one root to the graph"); // Finally, add this component to the dictionary of components _components.insert(std::make_pair(component_id, std::move(component))); @@ -489,17 +566,28 @@ public: std::set<std::string> additional_macros{}; std::vector<std::string> component_codes{}; // vector because order matters - // Go through the components graph (topological sort) and fill the data structures above + // Step 1: Allocate all kernel argument shared variables before generating the component code auto stack = topological_sort(); while(!stack.empty()) { auto curr_component_id = stack.top(); auto &curr_component = _components.find(curr_component_id)->second; + curr_component->allocate_shared_vars(_vtable); + + stack.pop(); + } + // Step 2: Generate component codes + stack = topological_sort(); + while(!stack.empty()) + { + auto curr_component_id = stack.top(); + auto &curr_component = _components.find(curr_component_id)->second; + auto curr_headers_list = curr_component->get_headers_list(); auto curr_additional_macros = curr_component->get_additional_macros(); auto curr_component_code = curr_component->get_component_code(); - const auto var_lut = curr_component->allocate_vars(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique + const auto var_lut = curr_component->get_tag_lut(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique component_codes.push_back(IClKernelComponent::replace_tags(curr_component_code, var_lut)); headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); @@ -511,7 +599,7 @@ public: stack.pop(); } - // This section assembles the data gathered by traversing the graph into the string "code" + // Step 3: Assemble the data gathered by traversing the graph into the string "code" std::string code = ""; for(auto &header : headers_list) @@ -596,34 +684,79 @@ public: ClKernelArgList get_arguments() const { ClKernelArgList arg_list{}; - for(const auto &arg_var : _vtable.get_kernel_arguments()) + for(const auto &arg_var : _vtable.get_kernel_arguments().get_all_vars()) { - arg_list.push_back(arg_var.desc); + arg_list[arg_var.desc.arg_id] = arg_var.desc; } return arg_list; } - const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const + /** Get the arguments as shared vars from the vtable + * + * @return SharedVarTable::Arguments + */ + SharedVarTable::Arguments get_argument_shared_vars() const + { + return _vtable.get_kernel_arguments(); + } + + const ITensorInfo *get_kernel_argument_info(const ArgumentID id) const { - auto it = _kernel_arguments.find(id); - if(it != _kernel_arguments.end()) + auto it = _kernel_tensors.find(id); + if(it != _kernel_tensors.end()) { - return &_kernel_arguments.find(id)->second; + return it->second; } return nullptr; } - ITensorInfo *get_kernel_argument_info(const ArgumentID id) const + ITensorInfo *get_kernel_argument_info(const ArgumentID id) { - const ClTensorDescriptor *arg_desc = get_kernel_argument(id); - if(arg_desc != nullptr) + auto it = _kernel_tensors.find(id); + if(it != _kernel_tensors.end()) { - return arg_desc->tensor_info; + return it->second; } return nullptr; } + /** Finalize graph construction. Graph is expected to not mutate after being finalized + */ + void finalize() + { + cache_root_component(); + assign_shared_var_group(); + } + + DependencyGraph get_graph() const + { + return _graph; + } private: + void cache_root_component() + { + const auto roots = _graph.get_root_ops(); + ARM_COMPUTE_ERROR_ON_MSG(roots.size() != 1, "Trying to add more than one root to the graph"); + _graph_root = roots.at(0); + } + /** Assign the group for each shared var. Can only be performed at the end of the graph construction, before building + */ + void assign_shared_var_group() + { + for(const auto &tensor : _kernel_tensors) + { + const auto tensor_id = tensor.first; + if(_graph.is_src_tensor(tensor_id) || _graph.is_dst_tensor(tensor_id)) + { + _shared_var_group_lut[tensor_id] = SharedVarGroup::Argument; + } + else + { + _shared_var_group_lut[tensor_id] = SharedVarGroup::Automatic; + } + } + } + void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const { visited.insert(component_id); @@ -666,41 +799,41 @@ private: std::string code; switch(var.desc.tensor_arg_type) { - case TensorArgType::Vector: + case ClKernelTensorArgType::Vector: { code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")"; break; } - case TensorArgType::Image: + case ClKernelTensorArgType::Image: { code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")"; break; } - case TensorArgType::Image_3D: + case ClKernelTensorArgType::Image_3D: { code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),"; code += "\n uint " + var.uniq_name + "_stride_z"; break; } - case TensorArgType::Image_3D_Export_To_ClImage2D: + case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: { code += "\n __read_only image2d_t " + var.uniq_name + "_img,"; code += "\n uint " + var.uniq_name + "_stride_z"; break; } - case TensorArgType::Tensor_4D_t_Buffer: + case ClKernelTensorArgType::Tensor_4D_t_Buffer: { code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)"; break; } - case TensorArgType::Tensor_4D_t_Image: + case ClKernelTensorArgType::Tensor_4D_t_Image: { code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)"; break; } default: { - ARM_COMPUTE_ERROR("Unsupported declaration generation for TensorArgType"); + ARM_COMPUTE_ERROR("Unsupported declaration generation for ClKernelTensorArgType"); } } return code; @@ -710,7 +843,7 @@ private: { std::string code = "\n__kernel void " + build_kernel_name() + "("; - for(const auto &arg : argument_list) + for(const auto &arg : argument_list.get_all_vars()) { code += generate_argument_declaration(arg) + ","; } @@ -722,54 +855,55 @@ private: std::string generate_global_section() const { - std::string code = ""; - code += " uint g_x = get_global_id(0);\n"; - code += " uint g_y = get_global_id(1);\n"; - code += " uint g_z = get_global_id(2);\n\n"; + auto dst_info = get_kernel_argument_info(_dst_id); + auto dst_w = dst_info->dimension(0); + auto dst_h = dst_info->dimension(1); + const auto tile_w = std::max(1, get_execution_window().x().step()); + const auto tile_h = std::max(1, get_execution_window().y().step()); + auto leftover_w = dst_w % tile_w; + auto leftover_h = dst_h % tile_h; - size_t tile_dim_x = _tile_info.empty() ? 1 : _tile_info.tile_dims.x(); - size_t tile_dim_y = _tile_info.empty() ? 1 : _tile_info.tile_dims.y(); + std::string code = ""; + code += std::string(" int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; + code += std::string(" int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + std::to_string(leftover_h) + ");\n"; + code += std::string(" int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n"); switch(_tile_info.clipping) { case ClippingStrategy::TOP_LEFT: - code += " const bool g_cond_x = (g_x == 0);\n"; - code += " const bool g_cond_y = (g_y == 0);\n"; + code += " const bool g_cond_x = (cout == 0);\n"; + code += " const bool g_cond_y = (mout == 0);\n"; break; case ClippingStrategy::TOP_RIGHT: - code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; - code += " const bool g_cond_y = (g_y == 0);\n"; + code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; + code += " const bool g_cond_y = (mout == 0);\n"; break; case ClippingStrategy::BOTTOM_LEFT: - code += " const bool g_cond_x = (g_x == 0);\n"; - code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; + code += " const bool g_cond_x = (cout == 0);\n"; + code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; break; case ClippingStrategy::BOTTOM_RIGHT: - code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; - code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; + code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; + code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; break; default: ARM_COMPUTE_ERROR("Unsupported clipping strategy"); } - code += "\n REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n"; - code += " REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n"; - return code; } TileDescriptor _tile_info{}; - int32_t _num_args{}; - int32_t _num_components{}; int32_t _num_complex_components{}; ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers - // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate) + DependencyGraph _graph{}; + + // Tensors, components and IDs with corresponding ptrs (except intermediate) std::unordered_map<ComponentID, ComponentUniquePtr> _components{}; - std::unordered_map<ArgumentID, ClTensorDescriptor> _kernel_arguments{}; - std::unordered_set<ArgumentID> _intermediate_tensors{}; + std::unordered_map<ArgumentID, ITensorInfo *> _kernel_tensors{}; // Argument group lookup. Can be replaced by extending the ArgumentID type to include group info std::unordered_map<ArgumentID, SharedVarGroup> _shared_var_group_lut{}; @@ -794,6 +928,4 @@ private: } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h index 41ab4e320b..d4feac7da9 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS @@ -72,6 +74,4 @@ inline std::string to_string(const ClKernelCode &code) } // namespace experimental } // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp index f951ce3d46..11fb1d53d0 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" @@ -31,6 +33,7 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" namespace arm_compute { namespace experimental @@ -44,7 +47,7 @@ ComponentType ClDirectConvolutionKernelComponent::get_component_type() const std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const { - return std::set<std::string> { "helpers.h", "tile_helpers.h", "repeat.h" }; + return std::set<std::string> { "helpers.h", "tile_helpers.h" }; } Window ClDirectConvolutionKernelComponent::get_window() const @@ -54,7 +57,17 @@ Window ClDirectConvolutionKernelComponent::get_window() const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info); + PadStrideInfo pad_stride_info + { + static_cast<unsigned int>(_desc.conv2d.stride.x()), + static_cast<unsigned int>(_desc.conv2d.stride.y()), + static_cast<unsigned int>(_desc.conv2d.pad.left), + static_cast<unsigned int>(_desc.conv2d.pad.right), + static_cast<unsigned int>(_desc.conv2d.pad.top), + static_cast<unsigned int>(_desc.conv2d.pad.bottom), + DimensionRoundingType::FLOOR /*default rounding type*/ + }; + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst_info, output_shape, @@ -64,6 +77,9 @@ Window ClDirectConvolutionKernelComponent::get_window() const const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u); const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U; + // const unsigned int num_rows = 1; + // const unsigned int vec_size = tile_info.tile_dims.x(); + // const unsigned int num_rows = tile_info.tile_dims.y(); // Create and configure kernel window Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); @@ -95,27 +111,30 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const //------------------ START KERNEL {{meta_kernel_id}} --------------------- // IN_0(src) {{src}} // IN_1(wei) {{weight}} + )_"; + if(bias_info != nullptr) + { + code += R"_( // IN_1(bia) {{bias}} + )_"; + } + code += R"_( // OUT(dst, accum) {{dst}} - const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM - const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - // Initialize the accumulators TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); { // All the tensor dimensions are passed at compile time. // In case of dynamic tensor support, the following dimensions should be passed as function argument. - #define _I{{WEI_WIDTH}} {{WEI_WIDTH}} - #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}} + #define _IWEI_WIDTH {{WEI_WIDTH}} + #define _IWEI_HEIGHT {{WEI_HEIGHT}} #define _ISRC_WIDTH {{src}}_w #define _ISRC_HEIGHT {{src}}_h #define _ISRC_CHANNELS {{src}}_c - #define _IDST_WIDTH {{dst_w}} - #define _IDST_HEIGHT {{dst_h}} - #define _IDST_CHANNELS {{dst_c}} - #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}) + #define _IDST_WIDTH {{arg_dst}}_w + #define _IDST_HEIGHT {{arg_dst}}_h + #define _IDST_CHANNELS {{arg_dst}}_c + #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // .v = access the whole vector (OpenCL vector) // .s[x] = access the vector element at position x (scalar access) @@ -136,13 +155,11 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const {{dst}}[i].v = 0; }) - uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0); - - for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i) + for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { int ck = 0; - int xk = i % _I{{WEI_WIDTH}}; - int yk = i / _I{{WEI_WIDTH}}; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_HEIGHT; int k = 0; for(; k <= (_ISRC_CHANNELS - K0); k += K0) @@ -201,6 +218,16 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const } code += R"_( + #undef _I_WEI_WIDTH + #undef _I_WEI_HEIGHT + #undef _ISRC_WIDTH + #undef _ISRC_HEIGHT + #undef _ISRC_CHANNELS + #undef _IDST_WIDTH + #undef _IDST_HEIGHT + #undef _IDST_CHANNELS + #undef _IY_MULTIPLIER + } )_"; @@ -217,44 +244,7 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const } code += R"_( - #undef _I{{WEI_WIDTH}} - #undef _I{{WEI_HEIGHT}} - #undef _ISRC_WIDTH - #undef _ISRC_HEIGHT - #undef _ISRC_CHANNELS - #undef _IDST_WIDTH - #undef _IDST_HEIGHT - #undef _IDST_CHANNELS - #undef _IY_MULTIPLIER } - - // Workaround for the discrepancy between tiles and repeats - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v; -#if M0 >= 2 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v; -#endif // M0 >= 2 -#if M0 >= 3 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v; -#endif // M0 >= 3 -#if M0 >= 4 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v; -#endif // M0 >= 4 -#if M0 >= 8 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v; -#endif // M0 >= 8 -#if M0 == 16 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v; -#endif // M0 == 16 //------------------ END KERNEL {{meta_kernel_id}} --------------------- )_"; return code.c_str(); @@ -306,19 +296,18 @@ bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const { const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + // const auto tile_info = _blueprint->impl().get_tile_info(); const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); const DataType data_type = src_info->data_type(); - const GPUTarget gpu_target = ICLKernel().get_target(); - - Window win = get_window(); + const GPUTarget gpu_target = CLScheduler::get().target(); - const unsigned int n0 = win.x().step(); - const unsigned int m0 = win.y().step(); + const unsigned int n0 = _blueprint->impl().get_execution_window().x().step(); + const unsigned int m0 = _blueprint->impl().get_execution_window().y().step(); const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst_info->dimension(channel_idx) % n0; + const unsigned int partial_store_n0 = dst_info->dimension(0) % n0; const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); // Update the padding for the weights tensor if we can export to cl_image @@ -338,54 +327,79 @@ CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() cons return build_opts; } -ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const +{ + const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); + const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + + vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); + + const GPUTarget gpu_target = CLScheduler::get().target(); + const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); + const ClKernelTensorArgType weight_type = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer; + vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight"); + + if(!_bias.is_empty()) // optional bias + { + vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias"); + } + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); +} + +ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const { TagLUT lut{}; const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); - const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - const GPUTarget gpu_target = ICLKernel().get_target(); - const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); - const TensorArgType weight_type = export_to_cl_image ? TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer; - lut["meta_kernel_id"] = id(); - lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src"); - lut["weight"] = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight"); + // Arguments and global shared variables + lut["src"] = vtable.get(_src); + lut["weight"] = vtable.get(_weight); if(!_bias.is_empty()) // optional bias { - lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias"); + lut["bias"] = vtable.get(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type()); } - lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst"); - - // Local build options - const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); - const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); - const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); + lut["dst"] = vtable.get(_dst); - lut["dst_w"] = dst_info->dimension(width_idx); - lut["dst_h"] = dst_info->dimension(height_idx); - lut["dst_c"] = dst_info->dimension(channel_idx); + const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var(); + lut["arg_dst"] = dst_argument.uniq_name; - lut["ACC_DATA_TYPE"] = src_info->data_type(); - lut["SRC_DATA_TYPE"] = src_info->data_type(); - lut["WEI_DATA_TYPE"] = weight_info->data_type(); + // Local build options + lut["meta_kernel_id"] = id(); + lut["ACC_DATA_TYPE"] = src_info->data_type(); + lut["SRC_DATA_TYPE"] = src_info->data_type(); + lut["WEI_DATA_TYPE"] = weight_info->data_type(); lut["SRC_TENSOR_TYPE"] = "BUFFER"; - lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? "IMAGE" : "BUFFER"; - - lut["WEI_WIDTH"] = weight_info->dimension(width_idx); - lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); + switch(vtable.get(_weight).desc.tensor_arg_type) + { + case ClKernelTensorArgType::Image_Export_To_ClImage2D: + case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: + case ClKernelTensorArgType::Tensor_4D_t_Image: + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } + default: + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } + } + const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); + const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); + lut["WEI_WIDTH"] = weight_info->dimension(width_idx); + lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); - lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride()); - lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride()); + lut["STRIDE_X"] = _desc.conv2d.stride.x(); + lut["STRIDE_Y"] = _desc.conv2d.stride.y(); - lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left(); - lut["PAD_TOP"] = _desc.pad_stride_info.pad_top(); + lut["PAD_LEFT"] = _desc.conv2d.pad.left; + lut["PAD_TOP"] = _desc.conv2d.pad.top; lut["ZERO_VALUE"] = 0; @@ -393,6 +407,4 @@ ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::a } } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h index 10c0e00a58..af9a65debc 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H @@ -39,7 +41,7 @@ namespace dynamic_fusion class ClDirectConvolutionKernelComponent : public IClKernelComponent { public: - ClDirectConvolutionKernelComponent(const ClKernelBlueprint *blueprint, const DirectConvolutionDescriptor &desc, + ClDirectConvolutionKernelComponent(ClKernelBlueprint *blueprint, const ClDirectConv2dKernelDescriptor &desc, const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{}) : IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst } { @@ -58,7 +60,8 @@ public: return { _src, _weight, _bias, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + virtual void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::string name() const override { @@ -66,16 +69,14 @@ public: } private: - DirectConvolutionDescriptor _desc{}; - Link _src{}; - Link _weight{}; - Link _bias{}; - Link _dst{}; + ClDirectConv2dKernelDescriptor _desc{}; + Link _src{}; + Link _weight{}; + Link _bias{}; + Link _dst{}; }; } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp index 84e4003d5d..2bbea8725d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h" #include "arm_compute/core/Validate.h" @@ -41,7 +43,7 @@ ComponentType ClElementwiseAddKernelComponent::get_component_type() const std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const { - return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "gemm_helpers.h", "repeat.h", "tile_helpers.h" }; + return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" }; } Window ClElementwiseAddKernelComponent::get_window() const @@ -67,63 +69,62 @@ Window ClElementwiseAddKernelComponent::get_window() const std::string ClElementwiseAddKernelComponent::get_component_code() const { std::string code; - return R"_( + const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; + + if(is_root) + { + return R"_( //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- - // IN_0(Accumulator) {{acc}} - // IN_1(Addend) {{addend}} + // IN_0(LHS) {{lhs}} + // IN_1(RHS) {{rhs}} + // OUT(dst, accum) {{dst}} - // c = addend + c (mix-precision, broadcast, boundary aware) + // dst = lhs + rhs (mix-precision, broadcast, boundary aware) + TILE({{DATA_TYPE}}, M0, N0, {{dst}}); { - __global uchar *addend_addr = {{addend}}_ptr + {{addend}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * {{addend}}_stride_y) + get_global_id(2) * {{addend}}_stride_z; \ - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, addend, addend_addr, 0, {{addend}}_stride_y, g_zero, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \ - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD_X_POS_0, M0, N0, {{acc}}, addend, DATA_TYPE_ACCUMULATOR, addend_hp); - } + TILE({{DATA_TYPE}}, M0, N0, lhs_tile); + TILE({{DATA_TYPE}}, M0, N0, rhs_tile); - // Workaround for the discrepancy between tiles and repeats -#if defined(IS_TILED) - {{acc}}[0].v = {{acc}}0; -#if M0 >= 2 - {{acc}}[1].v = {{acc}}1; -#endif // M0 >= 2 -#if M0 >= 3 - {{acc}}[2].v = {{acc}}2; -#endif // M0 >= 3 -#if M0 >= 4 - {{acc}}[3].v = {{acc}}3; -#endif // M0 >= 4 -#if M0 >= 8 - {{acc}}[4].v = {{acc}}4; - {{acc}}[5].v = {{acc}}5; - {{acc}}[6].v = {{acc}}6; - {{acc}}[7].v = {{acc}}7; -#endif // M0 >= 8 -#if M0 == 16 - {{acc}}[8].v = {{acc}}8; - {{acc}}[9].v = {{acc}}9; - {{acc}}[10].v = {{acc}}A; - {{acc}}[11].v = {{acc}}B; - {{acc}}[12].v = {{acc}}C; - {{acc}}[13].v = {{acc}}D; - {{acc}}[14].v = {{acc}}E; - {{acc}}[15].v = {{acc}}F; -#endif // M0 == 16 -#endif // defined(IS_TILED) + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile); + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile); + + T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); + } //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- +)_"; + } + else + { + return R"_( + //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- + // IN_0/Out(Accumulator) {{acc}} + // IN_1(Addend) {{addend}} + // acc = addend + acc (mix-precision, broadcast, boundary aware) + { + TILE({{DATA_TYPE}}, M0, N0, addend_tile); + + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile); + + T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}}); + } + //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- )_"; + } } CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const { - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - auto tile_info = _blueprint->impl().get_tile_info(); + const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); CLBuildOptions build_opts{}; + const auto n0 = _blueprint->impl().get_execution_window().x().step(); + const auto m0 = _blueprint->impl().get_execution_window().y().step(); + const auto partial_m0 = t_dst_info->dimension(1) % m0; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); - build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0)); return build_opts; } @@ -142,34 +143,56 @@ std::string ClElementwiseAddKernelComponent::generate_config_id() const return config_id; } -ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const { - // Determine which argument is the accumulator - Link accumulator; - Link addend; - if(_lhs.group == SharedVarGroup::Automatic) + const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; + vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs"); + vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs"); + if(is_root) { - accumulator = _lhs; - addend = _rhs; + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); } - else if(_rhs.group == SharedVarGroup::Automatic) +} + +ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag_lut(const SharedVarTable &vtable) const +{ + TagLUT lut{}; + const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + // Arguments and global shared variables + const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; + if(is_root) { - accumulator = _rhs; - addend = _lhs; + lut["lhs"] = vtable.get(_lhs); + lut["rhs"] = vtable.get(_rhs); + lut["dst"] = vtable.get(_dst); } else { - ARM_COMPUTE_ERROR("Invalid elementwise component linking"); + // Determine which link is the accumulator + Link accumulator; + Link addend; + if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic) + { + accumulator = _lhs; + addend = _rhs; + } + else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic) + { + accumulator = _rhs; + addend = _lhs; + } + else + { + ARM_COMPUTE_ERROR("Invalid elementwise component linking"); + } + lut["acc"] = vtable.get(accumulator); + lut["addend"] = vtable.get(addend); } - return { - { "meta_kernel_id", id() }, - { "acc", vtable.add(accumulator, ClKernelArgRuntimeDescriptor(accumulator.arg_id, TensorArgType::Image_3D), "add_acc") }, - { "addend", vtable.add(addend, ClKernelArgRuntimeDescriptor(addend.arg_id, TensorArgType::Image_3D), "add_addend") }, - // {"dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst")}, // dst is needed for the root version and/or non-inplace version should we need one - }; + // Local build options + lut["meta_kernel_id"] = id(); + lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type()); + return lut; } } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h index 35c9538b8d..4f7b69724d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H @@ -37,7 +39,7 @@ namespace dynamic_fusion class ClElementwiseAddKernelComponent : public IClKernelComponent { public: - ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst) + ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst) : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst } { } @@ -54,7 +56,8 @@ public: return { _lhs, _rhs, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + virtual void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::string name() const override { @@ -70,6 +73,4 @@ private: } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp deleted file mode 100644 index 45b81b424d..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp +++ /dev/null @@ -1,555 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/utils/helpers/float_ops.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -ComponentType ClGemmNativeKernelComponent::get_component_type() const -{ - return ComponentType::Complex; -} - -std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const -{ - return std::set<std::string> { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" }; -} - -Window ClGemmNativeKernelComponent::get_window() const -{ - ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); - ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); - ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info); - - bool reinterpret_input_as_3d = _desc.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation - GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d, - _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width, - _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset); - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info))); - - TensorInfo tmp_info(*dst_info); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst_info->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - win = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); - win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); - - AccessWindowStatic src0_access(lhs_info, 0, 0, - lhs_info->dimension(0), - lhs_info->dimension(1)); - AccessWindowStatic src1_access(rhs_info, 0, 0, - ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0), - rhs_info->dimension(1)); - AccessWindowStatic dst_access(dst_info, 0, 0, - dst_info->dimension(0), - dst_info->dimension(1)); - - if(bias_info != nullptr) - { - const int bias_processed_per_iteration_x = _desc.rhs_info.n0; - - AccessWindowStatic src2_access(bias_info, 0, 0, - ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x), - bias_info->dimension(1)); - - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - else - { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - if(window_changed == true) - { - ARM_COMPUTE_ERROR("Insufficient Padding!"); - } - - return collapsed; -} - -std::string ClGemmNativeKernelComponent::get_additional_macros() const -{ - return R"_( -#define VFMA(a, b, c) \ -({ \ - c = fma(a, b, c); \ -}) - -#if M0 == 1 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported -)_"; -} - -std::string ClGemmNativeKernelComponent::get_component_code() const -{ - auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); - auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - - auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); - auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; - auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); - - std::string code = R"_( - //------------------ START KERNEL {{meta_kernel_id}} --------------------- - // IN_0(lhs) {{lhs}} - // IN_1(rhs) {{rhs}} - )_"; - - if(!_bias.is_empty()) - { - code += R"_( - // IN_2(bias) {{bias}} - )_"; - } - - code += R"_( - // OUT(dst, accum) {{dst}} - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), {{dst}}, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - { -#if defined(DUMMY_WORK_ITEMS) - if((g_x * N0 >= N) || (g_y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = {{lhs}}_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * (uint){{lhs}}_stride_y; - - // Compute RHS matrix address - uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE); - )_"; - - if(dont_slide_b) - { - code += R"_( - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z; - )_"; - } - else - { - code += R"_( - rhs_offset += g_z * {{rhs}}_stride_z; - )_"; - } - - code += R"_( - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - )_"; - - if(reinterpret_input_as_3d) - { - code += R"_( - // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}}; - )_"; - } - else - { - code += R"_( - // Add offset for batched GEMM - lhs_offset += g_z * {{lhs}}_stride_z; - )_"; - } - - code += R"_( - int i = 0; -#if {{K0}} > 1 - for(; i <= (K - {{K0}}); i += {{K0}}) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs); - - // Load values from RHS matrix - LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero); - - RHS_VFMA_M0xN0(0, a, b0, {{dst}}); - RHS_VFMA_M0xN0(1, a, b1, {{dst}}); -#if {{K0}} > 2 - RHS_VFMA_M0xN0(2, a, b2, {{dst}}); -#endif // K0 > 2 -#if {{K0}} > 3 - RHS_VFMA_M0xN0(3, a, b3, {{dst}}); -#endif // K0 > 3 -#if {{K0}} > 4 - RHS_VFMA_M0xN0(4, a, b4, {{dst}}); - RHS_VFMA_M0xN0(5, a, b5, {{dst}}); - RHS_VFMA_M0xN0(6, a, b6, {{dst}}); - RHS_VFMA_M0xN0(7, a, b7, {{dst}}); -#endif // K0 > 4 -#if {{K0}} > 8 - RHS_VFMA_M0xN0(8, a, b8, {{dst}}); - RHS_VFMA_M0xN0(9, a, b9, {{dst}}); - RHS_VFMA_M0xN0(A, a, bA, {{dst}}); - RHS_VFMA_M0xN0(B, a, bB, {{dst}}); - RHS_VFMA_M0xN0(C, a, bC, {{dst}}); - RHS_VFMA_M0xN0(D, a, bD, {{dst}}); - RHS_VFMA_M0xN0(E, a, bE, {{dst}}); - RHS_VFMA_M0xN0(F, a, bF, {{dst}}); -#endif // K0 > 8 - - lhs_offset += {{K0}} * sizeof(DATA_TYPE); - rhs_offset += {{K0}} * {{rhs}}_stride_y; - } -#endif // K0 > 1 - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 0 * {{lhs}}_stride_y + zlhs0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 1 * {{lhs}}_stride_y + zlhs1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 2 * {{lhs}}_stride_y + zlhs2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 3 * {{lhs}}_stride_y + zlhs3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 4 * {{lhs}}_stride_y + zlhs4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 5 * {{lhs}}_stride_y + zlhs5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 6 * {{lhs}}_stride_y + zlhs6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 7 * {{lhs}}_stride_y + zlhs7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b = VLOAD(N0)(0, (__global DATA_TYPE *)({{rhs}}_ptr + rhs_offset + 0 * {{rhs}}_stride_y)); - RHS_VFMA_M0xN0(0, a, b, {{dst}}); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += {{rhs}}_stride_y; - } - - // Multiply by the weight of matrix-matrix product and store the result - )_"; - if(has_alpha) - { - code += R"_( - SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}}); - )_"; - } - - if(!_bias.is_empty()) - { - if(_desc.broadcast_bias) - { - code += R"_( - // Add beta*bias - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - )_"; - - if(helpers::float_ops::is_one(_desc.beta)) - { - code += R"_( - SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}}); - )_"; - } - - code += R"_( - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0); - )_"; - } - else - { - code += R"_( - // Add beta*bias - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, - PARTIAL_STORE_M0) - * {{bias}}_stride_y) - + g_z * {{bias}}_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - )_"; - - if(helpers::float_ops::is_one(_desc.beta)) - { - code += R"_( - SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}}); - )_"; - } - - code += R"_( - // c = c + bias - ADD_BLOCK(M0, {{dst}}, bias); - )_"; - } - } - - code += R"_( - } - //------------------ END KERNEL {{meta_kernel_id}} --------------------- - )_"; - return code.c_str(); -} - -CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const -{ - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - auto tile_info = _blueprint->impl().get_tile_info(); - - CLBuildOptions build_opts{}; - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); - build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y())); - build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x())); - build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); - build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); - - return build_opts; -} - -std::string ClGemmNativeKernelComponent::generate_config_id() const -{ - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - std::string config_id{}; - config_id += (_bias.is_empty() ? "add_bias_" : ""); - config_id += (_desc.broadcast_bias ? "broadcast_bias_" : ""); - config_id += (_desc.reinterpret_input_as_3d ? "3di_" : ""); - config_id += (_desc.depth_output_gemm3d > 0 ? "3do_" : ""); - config_id += lower_string(string_from_data_type(t_dst_info->data_type())); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(1)); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(0)); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.k); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(2)); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.lhs_info.m0); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.rhs_info.n0); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.rhs_info.k0); - return config_id; -} - -ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const -{ - TagLUT lut{}; - - lut["meta_kernel_id"] = id(); - lut["lhs"] = vtable.add(_lhs, ClKernelArgRuntimeDescriptor(_lhs.arg_id, TensorArgType::Image_3D), "lhs"); - lut["rhs"] = vtable.add(_rhs, ClKernelArgRuntimeDescriptor(_rhs.arg_id, TensorArgType::Image_3D), "rhs"); - if(!_bias.is_empty()) // optional bias - { - lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias"); - } - lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst"); - - // Local build options - auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); - auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); - auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr; - auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; - auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0; - auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); - - lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0); - - if(has_alpha) - { - lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha); - } - if(has_beta) - { - lut["BETA"] = float_to_string_with_full_precision(_desc.beta); - } - if(dont_slide_b) - { - lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2)); - } - - if(reinterpret_output_as_3d) - { - lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1)); - lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2)); - } - else if(reinterpret_input_as_3d) - { - lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1)); - lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2)); - } - - return lut; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h deleted file mode 100644 index b282856b56..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H - -#include "arm_compute/core/Steps.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" -#include "src/core/helpers/AutoConfiguration.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -class ClGemmNativeKernelComponent : public IClKernelComponent -{ -public: - ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc, - const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{}) - : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst } - { - } - - ComponentType get_component_type() const override; - std::set<std::string> get_headers_list() const override; - std::string get_additional_macros() const override; - std::string get_component_code() const override; - Window get_window() const override; - ClKernelArgList get_args(); - CLBuildOptions generate_build_options() const override; - std::string generate_config_id() const override; - - virtual std::vector<Link> get_links() const override - { - return { _lhs, _rhs, _bias, _dst }; - } - - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; - - virtual std::string name() const override - { - return "gemm_mm_native_" + std::to_string(id()); - } - -private: - GemmNativeDescriptor _desc{}; - Link _lhs{}; - Link _rhs{}; - Link _bias{}; - Link _dst{}; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h index de02f948e9..c6716a0a23 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h @@ -21,16 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h" -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp index 5f023ba528..e0b210f4ed 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h" @@ -65,25 +67,36 @@ std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const { auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - auto tile_info = _blueprint->impl().get_tile_info(); + // auto tile_info = _blueprint->impl().get_tile_info(); CLBuildOptions build_opts{}; + const auto n0 = _blueprint->impl().get_execution_window().x().step(); + const auto m0 = _blueprint->impl().get_execution_window().y().step(); + const auto partial_m0 = t_dst_info->dimension(0) % m0; + const auto partial_n0 = t_dst_info->dimension(1) % n0; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); - build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_n0)); return build_opts; } -ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClStoreBlockBoundaryAwareKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const +{ + vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Image_3D), "src"); + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Image_3D), "dst"); +} + +ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::get_tag_lut(const SharedVarTable &vtable) const { return { { "meta_kernel_id", id() }, - { "src", vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src") }, - { "dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst") }, + { "src", vtable.get(_src) }, + { "dst", vtable.get(_dst) }, }; } @@ -96,19 +109,26 @@ std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() cons { return R"_( //------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- + { + #define _IDST_WIDTH {{dst}}_w + #define _IDST_HEIGHT {{dst}}_h + TILE(uint, M0, 1, dst_indirect_y); - TILE(uint, M0, 1, dst_indirect_y); + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(mout + i, (int)({{dst_w}} * {{dst_h}}) - 1); - dst_indirect_y[i].v += bout * (int)({{dst_w}} * {{dst_h}}); - }) + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, PARTIAL_N0 != 0 && g_cond_x, {{src}}, dst_indirect_y); + T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y); - //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- + #undef _IDST_WIDTH + #undef _IDST_HEIGHT + //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- + } )_"; } @@ -120,21 +140,24 @@ CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options return build_opts; } -ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClStoreIndirectWidthSelectKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const +{ + vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); +} + +ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::get_tag_lut(const SharedVarTable &vtable) const { TagLUT lut{}; - lut["meta_kernel_id"] = id(); - lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src"); - lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst"); + // Arguments and global shared variables + lut["src"] = vtable.get(_src); + lut["dst"] = vtable.get(_dst); // Local build options - auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - lut["dst_w"] = dst_info->dimension(1); - lut["dst_h"] = dst_info->dimension(2); - + lut["meta_kernel_id"] = id(); lut["DST_TENSOR_TYPE"] = "BUFFER"; + const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); lut["DST_DATA_TYPE"] = dst_info->data_type(); return lut; @@ -142,6 +165,4 @@ ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKern } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h index c7da8bd3e8..26883d7fa0 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H @@ -37,21 +39,21 @@ namespace dynamic_fusion class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent { public: - ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst) + ClStoreBlockBoundaryAwareKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst) : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } { } ComponentType get_component_type() const override; std::string get_component_code() const override; CLBuildOptions generate_build_options() const override; + TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::vector<Link> get_links() const override { return { _src, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; - virtual std::string name() const override { return ""; @@ -65,21 +67,21 @@ private: class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent { public: - ClStoreIndirectWidthSelectKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst) + ClStoreIndirectWidthSelectKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst) : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } { } ComponentType get_component_type() const override; std::string get_component_code() const override; CLBuildOptions generate_build_options() const override; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::vector<Link> get_links() const override { return { _src, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; - virtual std::string name() const override { return ""; @@ -93,6 +95,4 @@ private: } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
\ No newline at end of file |