aboutsummaryrefslogtreecommitdiff
path: root/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/experimental/dynamic_fusion/ClKernelBuildingImpl')
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h59
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp23
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h6
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp89
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h19
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h4
6 files changed, 186 insertions, 14 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index 3b5160a055..b285cc2b54 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -29,6 +29,7 @@
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GPUTarget.h"
+#include "src/core/common/Macros.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
@@ -63,8 +64,8 @@ enum class SharedVarGroup
Automatic // Automatic variables declared within the kernel body
};
-/** Specifies a shared variable ink for a component.
- * It describes all the information that's availbale when a component is constructed / added:
+/** Specifies a shared variable link for a component.
+ * It describes all the information that's available when a component is constructed / added:
* e.g. its linkage (via ArgumentID and io) and its group
* This is not shared variable on its own, but is used for instantiating a SharedVar when building the code
*/
@@ -204,6 +205,13 @@ public:
};
using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
public:
+    /** Construct a component that is attached to a kernel blueprint.
+     *
+     * Marked explicit so that a blueprint pointer is never implicitly converted into a component.
+     *
+     * @param[in] blueprint Blueprint this component belongs to. The pointer is stored as-is in _blueprint.
+     */
+    explicit IClKernelComponent(const ClKernelBlueprint *blueprint)
+        : _blueprint(blueprint)
+    {
+    }
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClKernelComponent);
+
virtual ~IClKernelComponent() = default;
virtual ComponentType get_component_type() const = 0;
virtual std::vector<Link> get_links() const = 0;
@@ -278,6 +286,11 @@ public:
{
return "";
}
+
+    /** Execution window contributed by this component.
+     *
+     * @return A default-constructed Window. Components that compute a window override this method.
+     */
+    virtual Window get_window() const
+    {
+        Window default_window{};
+        return default_window;
+    }
/** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code
*
* @param vtable
@@ -290,6 +303,9 @@ public:
return "";
}
+protected:
+ const ClKernelBlueprint *_blueprint;
+
private:
ComponentID _id{};
};
@@ -398,6 +414,12 @@ public:
// Additionally, set this component as one that treats this argument as "Output" (append to index 1)
else
{
+ if(component->get_component_type() == ComponentType::Store)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(_dst_id >= 0, "Trying to add more than one dst argument to the graph");
+ _dst_id = arg_id;
+ }
+
for(const auto &subseq_component : _outgoing_components[arg_id])
{
_component_graph[component_id].push_back(subseq_component);
@@ -430,7 +452,6 @@ public:
stack.pop();
}
- std::cout << name << std::endl;
return name;
}
@@ -508,7 +529,15 @@ public:
+    /** Execution window of the fused kernel, taken from the root component of the graph.
+     *
+     * Requires a graph root and a destination argument id to be set (both are asserted).
+     *
+     * @return The window reported by the root component's get_window()
+     */
Window get_execution_window() const
{
- return Window{};
+        ARM_COMPUTE_ERROR_ON_MSG(_graph_root < 0, "No root found in the component graph");
+        // Any negative id means "not set"; this matches the "_dst_id >= 0" check done when a Store component is added
+        ARM_COMPUTE_ERROR_ON_MSG(_dst_id < 0, "Destination Tensor Id should be ready before calling get_execution_window()");
+
+        // Validate the lookup before dereferencing the iterator
+        const auto root_it = _components.find(_graph_root);
+        ARM_COMPUTE_ERROR_ON_MSG(root_it == _components.end(), "Root component not found in the blueprint components");
+
+        return root_it->second->get_window();
+    }
+
+    /** Id of the destination argument.
+     *
+     * @return The stored destination argument id (_dst_id is initialised to -1)
+     */
+    ArgumentID get_dst_id() const
+    {
+        return _dst_id;
}
ClKernelArgList get_arguments() const
@@ -521,6 +550,26 @@ public:
return arg_list;
}
+    /** Look up the tensor descriptor registered for a kernel argument.
+     *
+     * @param[in] id Id of the kernel argument to look up
+     *
+     * @return Pointer to the descriptor stored in _kernel_arguments, or nullptr if @p id is not present
+     */
+    const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const
+    {
+        // Single lookup: the same iterator serves both the presence check and the access
+        const auto it = _kernel_arguments.find(id);
+        return (it != _kernel_arguments.end()) ? &it->second : nullptr;
+    }
+
+    /** Fetch the tensor info attached to a kernel argument.
+     *
+     * @param[in] id Id of the kernel argument to look up
+     *
+     * @return The tensor_info member of the argument's descriptor, or nullptr if get_kernel_argument() finds no descriptor for @p id
+     */
+    ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
+    {
+        const ClTensorDescriptor *const descriptor = get_kernel_argument(id);
+        if(descriptor == nullptr)
+        {
+            return nullptr;
+        }
+        return descriptor->tensor_info;
+    }
+
private:
void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const
{
@@ -635,6 +684,8 @@ private:
int32_t _num_components{};
int32_t _num_complex_components{};
+ ArgumentID _dst_id{ -1 };
+
// Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
std::unordered_map<ArgumentID, ClTensorDescriptor> _kernel_arguments{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index a44b5faee2..06c29c4253 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -24,6 +24,9 @@
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
{
@@ -41,6 +44,26 @@ std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
return std::set<std::string> { "gemm_helpers.h", "repeat.h" };
}
+/** Compute the execution window of the elementwise addition component.
+ *
+ * The lhs, rhs and destination tensor infos are fetched from the blueprint and must all be non-null.
+ * The destination info is handed to auto_init_if_empty() together with the broadcast shape of lhs and rhs
+ * and the lhs data type, so it may be modified by this call.
+ *
+ * @return Maximum window over the destination info, stepped along x by the adjusted vector size
+ */
+Window ClElementwiseAddKernelComponent::get_window() const
+{
+    const auto &blueprint_impl = _blueprint->impl();
+
+    const ITensorInfo *lhs_info = blueprint_impl.get_kernel_argument_info(_lhs.arg_id);
+    const ITensorInfo *rhs_info = blueprint_impl.get_kernel_argument_info(_rhs.arg_id);
+    ITensorInfo       *dst_info = blueprint_impl.get_kernel_argument_info(blueprint_impl.get_dst_id());
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
+
+    // Only the shape half of the (shape, valid region) pair is needed here
+    const TensorShape out_shape = ITensorInfo::broadcast_shape_and_valid_region(*lhs_info, *rhs_info).first;
+
+    auto_init_if_empty(*dst_info, out_shape, 1, lhs_info->data_type());
+
+    // Number of elements per iteration: a 16-byte vector divided by the element size, adjusted to dimension 0 of dst
+    const unsigned int vector_size_byte_opencl = 16;
+    const unsigned int elems_per_iteration     = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0));
+
+    return calculate_max_window(*dst_info, Steps(elems_per_iteration));
+}
+
std::string ClElementwiseAddKernelComponent::get_component_code() const
{
std::string code;
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index c0de4ac9b8..fe5f964c54 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -37,13 +37,15 @@ namespace dynamic_fusion
class ClElementwiseAddKernelComponent : public IClKernelComponent
{
public:
- ClElementwiseAddKernelComponent(const Link &lhs, const Link &rhs, const Link &dst)
- : _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
+ ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
+ : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
{
}
+
ComponentType get_component_type() const override;
std::set<std::string> get_headers_list() const override;
std::string get_component_code() const override;
+ Window get_window() const override;
virtual std::vector<Link> get_links() const override
{
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
index 1521973d55..e70e5d5ea5 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
@@ -24,6 +24,9 @@
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
namespace arm_compute
{
@@ -41,6 +44,92 @@ std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const
return std::set<std::string> { "./common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" };
}
+/** Compute the execution window of the native GEMM component.
+ *
+ * The lhs, rhs and destination tensor infos must be registered in the blueprint; the bias info is optional.
+ * The destination info is handed to auto_init_if_empty(), so it may be modified by this call.
+ * Raises "Insufficient Padding!" if an update_window_and_padding() call reports that a window changed.
+ *
+ * @return The execution window, collapsed from dimension min(dst num_dimensions, 2) onwards
+ */
+Window ClGemmNativeKernelComponent::get_window() const
+{
+    const auto &blueprint_impl = _blueprint->impl();
+
+    ITensorInfo *lhs_info  = blueprint_impl.get_kernel_argument_info(_lhs.arg_id);
+    ITensorInfo *rhs_info  = blueprint_impl.get_kernel_argument_info(_rhs.arg_id);
+    ITensorInfo *bias_info = blueprint_impl.get_kernel_argument_info(_bias.arg_id);
+    ITensorInfo *dst_info  = blueprint_impl.get_kernel_argument_info(blueprint_impl.get_dst_id());
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
+
+    // The output is only reinterpreted as 3D when the input is not:
+    // if both flags agree, the output reinterpretation is switched off.
+    const bool input_as_3d  = _desc.reinterpret_input_as_3d;
+    const bool output_as_3d = (_desc.depth_output_gemm3d != 0) && !input_as_3d;
+
+    // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation
+    GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d,
+                             _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width,
+                             _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset);
+
+    // Initialise dst from the computed matrix-multiply shape if it has not been initialised yet
+    auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info)));
+
+    TensorInfo tmp_info(*dst_info);
+    if(output_as_3d)
+    {
+        // The execute window is based on a 2D GEMM, so when dst is reinterpreted as 3D
+        // the window is built on the 2D collapsed version of the tensor
+        TensorShape collapsed_shape(dst_info->tensor_shape());
+        collapsed_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(collapsed_shape);
+    }
+
+    // win drives the execute_window_loop; win_out is only used to update the padding requirements of dst
+    Window win     = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
+    Window win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
+
+    AccessWindowStatic src0_access(lhs_info, 0, 0,
+                                   lhs_info->dimension(0),
+                                   lhs_info->dimension(1));
+    AccessWindowStatic src1_access(rhs_info, 0, 0,
+                                   ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0),
+                                   rhs_info->dimension(1));
+    AccessWindowStatic dst_access(dst_info, 0, 0,
+                                  dst_info->dimension(0),
+                                  dst_info->dimension(1));
+
+    bool window_changed = false;
+    if(bias_info != nullptr)
+    {
+        const int bias_processed_per_iteration_x = _desc.rhs_info.n0;
+
+        AccessWindowStatic src2_access(bias_info, 0, 0,
+                                       ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x),
+                                       bias_info->dimension(1));
+
+        window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access);
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, src0_access, src1_access);
+    }
+
+    // The dst update is only evaluated when the input update reported no change
+    if(!window_changed)
+    {
+        window_changed = update_window_and_padding(win_out, dst_access);
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u);
+    Window             collapsed             = win.collapse(win, dimension_to_collapse);
+
+    if(window_changed)
+    {
+        ARM_COMPUTE_ERROR("Insufficient Padding!");
+    }
+
+    return collapsed;
+}
+
std::string ClGemmNativeKernelComponent::get_additional_macros() const
{
return R"_(
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
index 38f007c07c..09933a8932 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
@@ -26,7 +26,10 @@
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
+#include "src/core/helpers/AutoConfiguration.h"
namespace arm_compute
{
@@ -37,14 +40,17 @@ namespace dynamic_fusion
class ClGemmNativeKernelComponent : public IClKernelComponent
{
public:
- ClGemmNativeKernelComponent(const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
- : _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
+ ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc,
+ const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
+ : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
{
}
+
ComponentType get_component_type() const override;
std::set<std::string> get_headers_list() const override;
std::string get_additional_macros() const override;
std::string get_component_code() const override;
+ Window get_window() const override;
ClKernelArgList get_args();
virtual std::vector<Link> get_links() const override
@@ -60,10 +66,11 @@ public:
}
private:
- Link _lhs{};
- Link _rhs{};
- Link _bias{};
- Link _dst{};
+ GemmNativeDescriptor _desc{};
+ Link _lhs{};
+ Link _rhs{};
+ Link _bias{};
+ Link _dst{};
};
} // namespace dynamic_fusion
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index f0d01d30a9..ad7a207ef8 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -37,8 +37,8 @@ namespace dynamic_fusion
class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent
{
public:
- ClStoreBlockBoundaryAwareKernelComponent(const Link &src, const Link &dst)
- : _src{ src }, _dst{ dst }
+ ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+ : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
{
}
ComponentType get_component_type() const override;