aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGunes Bayir <gunes.bayir@arm.com>2022-03-28 21:32:33 +0100
committerSiCong Li <sicong.li@arm.com>2022-04-13 10:36:30 +0000
commit16c5697085c256c19fb8ba4bef6188d61f30a88b (patch)
tree609bfe2082c939ff37bdf6ef37bc22fc071bd934
parent5d606cccaabdfc435734c9fb51e11f14f3724a23 (diff)
downloadComputeLibrary-16c5697085c256c19fb8ba4bef6188d61f30a88b.tar.gz
Add DirectConvolution2D kernel component for dynamic fusion
Resolves: COMPMID-5156 Change-Id: I438da924cb80d3bce72106b06ca7181e0606bd01 Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7399 Reviewed-by: SiCong Li <sicong.li@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp1
-rw-r--r--arm_compute/runtime/CL/CLScheduler.h9
-rw-r--r--filelist.json1
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp24
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h25
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h46
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp398
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h81
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp32
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h1
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp54
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h28
-rw-r--r--src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp25
-rw-r--r--src/runtime/CL/CLScheduler.cpp7
-rw-r--r--tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp199
15 files changed, 880 insertions, 51 deletions
diff --git a/Android.bp b/Android.bp
index 0d53040e18..a440e79ffd 100644
--- a/Android.bp
+++ b/Android.bp
@@ -364,6 +364,7 @@ cc_library_static {
"src/core/Validate.cpp",
"src/core/Version.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
+ "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 7ad59782e7..362d2ba137 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -87,6 +87,15 @@ public:
*/
void default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr);
+ /** Re-initializes the context and command queue used by the scheduler to default values
+ * and sets a default device and kernel path for the @ref CLKernelLibrary.
+ *
+ * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
+ * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
+ * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
+ */
+ void default_reinit(ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, CLBackendType cl_backend_type = CLBackendType::Native);
+
/** Schedule the execution of the passed kernel if possible.
*
* @param[in] kernel Kernel to execute.
diff --git a/filelist.json b/filelist.json
index 1af856d03b..44e71c7e69 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2064,6 +2064,7 @@
"experimental": {
"dynamic_fusion": [
"src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
+ "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
index 6db1ca4cf5..3e9ed060be 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
@@ -94,6 +94,23 @@ Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescript
{
return Status{};
}
+
+Status add_kcomp_direct_conv(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &,
+ const DirectConvolutionDescriptor &direct_conv2d_desc,
+ ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id)
+{
+ kernel_blueprint.impl().add_component(
+ std::make_unique<ClDirectConvolutionKernelComponent>(
+ &kernel_blueprint,
+ direct_conv2d_desc,
+ SharedVarLink{ src_id, SharedVarIO::Input, kernel_blueprint.impl().group(src_id) },
+ SharedVarLink{ weight_id, SharedVarIO::Input, kernel_blueprint.impl().group(weight_id) },
+ SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) },
+ SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) }));
+
+ return Status{};
+}
+
Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, ArgumentID src_tile, ArgumentID dst_tile, const StoreType &store_type)
{
switch(store_type)
@@ -105,6 +122,13 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon
SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
break;
+ case StoreType::TStoreIndirectWidthSelect:
+ kernel_blueprint.impl().add_component(
+ std::make_unique<ClStoreIndirectWidthSelectKernelComponent>(
+ &kernel_blueprint,
+ SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
+ SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
+ break;
default:
ARM_COMPUTE_ERROR("Store mode not yet supported.");
}
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
index 27ab294cc9..23629f47bc 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
@@ -56,7 +56,10 @@ enum class TensorArgType : int
Image_3D_Export_To_ClImage2D,
Tensor_3D,
- Tensor_4D
+ Tensor_4D,
+
+ Tensor_4D_t_Buffer,
+ Tensor_4D_t_Image
};
/** Describes all the info required to add a kernel argument at run time */
struct ClKernelArgRuntimeDescriptor
@@ -100,13 +103,12 @@ struct ClKernelComponentDescriptor
/** Component: Tensor Argument */
struct ClTensorDescriptor
{
- ClTensorDescriptor(ITensorInfo *info, unsigned int dim)
- : tensor_info(info), slice_dim(dim)
+ ClTensorDescriptor(ITensorInfo *info)
+ : tensor_info(info)
{
}
ITensorInfo *tensor_info;
- unsigned int slice_dim;
};
Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &);
@@ -134,7 +136,7 @@ struct GemmNativeDescriptor
};
Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &,
- ArgumentID input_id, ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id);
+ ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id);
/** Component: Eltwise Add */
struct EltwiseAddDescriptor
@@ -150,6 +152,14 @@ struct ActivationDescriptor
};
Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
+/** Component: Direct Convolution **/
+struct DirectConvolutionDescriptor
+{
+ PadStrideInfo pad_stride_info{};
+};
+Status add_kcomp_direct_conv(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const DirectConvolutionDescriptor &,
+ ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id);
+
enum class ClippingStrategy
{
TOP_LEFT,
@@ -239,8 +249,9 @@ Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint
///// Tuning /////
struct ClExecutionDescriptor
{
- cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */
- cl::NDRange gws{}; /**< Global work-group to be used */
+ cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */
+ cl::NDRange gws{}; /**< Global work-group to be used */
+ bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
};
Status tune_static(ClExecutionDescriptor &, const ClKernelCode &);
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index 4c720ea1aa..e24c742fd7 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -31,6 +31,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GPUTarget.h"
#include "src/core/common/Macros.h"
+#include "support/Requires.h"
#include "support/StringSupport.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
@@ -198,8 +199,9 @@ public:
{
}
- TagVal(ComponentID id)
- : value{ std::to_string(id) }
+ template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
+ TagVal(T val)
+ : value{ support::cpp11::to_string(val) }
{
}
@@ -208,6 +210,16 @@ public:
{
}
+ TagVal(const char *val)
+ : value{ std::string(val) }
+ {
+ }
+
+ TagVal(const DataType &data_type)
+ : value{ get_cl_type_from_data_type(data_type) }
+ {
+ }
+
std::string value{};
};
using TagLUT = std::unordered_map<Tag, TagVal>; // Used for instantiating a code template / replacing tags
@@ -633,21 +645,36 @@ private:
std::string code;
switch(var.desc.tensor_arg_type)
{
+ case TensorArgType::Vector:
+ {
+ code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
case TensorArgType::Image:
{
- code += "IMAGE_DECLARATION(" + var.uniq_name + ")";
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
break;
}
case TensorArgType::Image_3D:
{
- code += "IMAGE_DECLARATION(" + var.uniq_name + "),\n";
- code += "uint " + var.uniq_name + "_stride_z";
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
+ code += "\n uint " + var.uniq_name + "_stride_z";
break;
}
case TensorArgType::Image_3D_Export_To_ClImage2D:
{
- code += "__read_only image2d_t " + var.uniq_name + "_img,\n";
- code += "uint " + var.uniq_name + "_stride_z,\n";
+ code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
+ code += "\n uint " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case TensorArgType::Tensor_4D_t_Buffer:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
+ break;
+ }
+ case TensorArgType::Tensor_4D_t_Image:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
break;
}
default:
@@ -664,7 +691,7 @@ private:
for(const auto &arg : argument_list)
{
- code += "\n " + generate_argument_declaration(arg) + ",";
+ code += generate_argument_declaration(arg) + ",";
}
code[code.length() - 1] = ')';
@@ -674,7 +701,8 @@ private:
std::string generate_global_section() const
{
- std::string code = " uint g_x = get_global_id(0);\n";
+ std::string code = "";
+ code += " uint g_x = get_global_id(0);\n";
code += " uint g_y = get_global_id(1);\n";
code += " uint g_z = get_global_id(2);\n\n";
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
new file mode 100644
index 0000000000..f951ce3d46
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ComponentType ClDirectConvolutionKernelComponent::get_component_type() const
+{
+ return ComponentType::Complex;
+}
+
+std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const
+{
+ return std::set<std::string> { "helpers.h", "tile_helpers.h", "repeat.h" };
+}
+
+Window ClDirectConvolutionKernelComponent::get_window() const
+{
+ const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+ const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+ auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+ // Get dst shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst_info, output_shape,
+ 1,
+ src_info->data_type(),
+ src_info->quantization_info());
+
+ const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u);
+ const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U;
+
+ // Create and configure kernel window
+ Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
+ win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
+
+ return win;
+}
+
+std::string ClDirectConvolutionKernelComponent::get_additional_macros() const
+{
+ return R"_()_"; // no macros
+}
+
+std::string ClDirectConvolutionKernelComponent::get_component_code() const
+{
+ const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+ const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
+
+ ARM_COMPUTE_ERROR_ON_MSG(src_info->data_layout() != DataLayout::NHWC, "Only NHWC data layout is supported by this component.");
+
+ const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
+ const auto k0 = adjust_vec_size(is_data_type_quantized(src_info->data_type()) ? 16u : 8u, src_info->dimension(channel_idx));
+ const bool leftover_loop = (src_info->dimension(channel_idx) % k0) != 0;
+
+ std::string code = R"_(
+ //------------------ START KERNEL {{meta_kernel_id}} ---------------------
+ // IN_0(src) {{src}}
+ // IN_1(wei) {{weight}}
+ // IN_2(bia) {{bias}}
+ // OUT(dst, accum) {{dst}}
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // Initialize the accumulators
+ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
+ {
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+ #define _I{{WEI_WIDTH}} {{WEI_WIDTH}}
+ #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}}
+ #define _ISRC_WIDTH {{src}}_w
+ #define _ISRC_HEIGHT {{src}}_h
+ #define _ISRC_CHANNELS {{src}}_c
+ #define _IDST_WIDTH {{dst_w}}
+ #define _IDST_HEIGHT {{dst_h}}
+ #define _IDST_CHANNELS {{dst_c}}
+ #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}})
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((mout + i) % _IDST_WIDTH) * {{STRIDE_X}};
+ yi[i].v = ((mout + i) / _IDST_WIDTH) * {{STRIDE_Y}};
+ xi[i].v -= {{PAD_LEFT}};
+ yi[i].v -= {{PAD_TOP}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ {{dst}}[i].v = 0;
+ })
+
+ uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0);
+
+ for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i)
+ {
+ int ck = 0;
+ int xk = i % _I{{WEI_WIDTH}};
+ int yk = i / _I{{WEI_WIDTH}};
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, K0, a);
+ TILE({{WEI_DATA_TYPE}}, N0, K0, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
+
+ ck += K0;
+ }
+
+ // We voluntarily use _ISRC_CHANNELS rather than a compile-time channel count
+ // This leftover handling should be removed in case of dynamic tensor support
+ )_";
+
+ if(leftover_loop)
+ {
+ code += R"_(
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, 1, a);
+ TILE({{WEI_DATA_TYPE}}, N0, 1, b);
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
+
+ ++ck;
+ }
+ )_";
+ }
+
+ code += R"_(
+ }
+ )_";
+
+ if(bias_info != nullptr)
+ {
+ code += R"_(
+ TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
+
+ T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ADD_BROADCAST_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+ )_";
+ }
+
+ code += R"_(
+ #undef _I{{WEI_WIDTH}}
+ #undef _I{{WEI_HEIGHT}}
+ #undef _ISRC_WIDTH
+ #undef _ISRC_HEIGHT
+ #undef _ISRC_CHANNELS
+ #undef _IDST_WIDTH
+ #undef _IDST_HEIGHT
+ #undef _IDST_CHANNELS
+ #undef _IY_MULTIPLIER
+ }
+
+ // Workaround for the discrepancy between tiles and repeats
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v;
+#if M0 >= 2
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v;
+#endif // M0 >= 2
+#if M0 >= 3
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v;
+#endif // M0 >= 3
+#if M0 >= 4
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v;
+#endif // M0 >= 4
+#if M0 >= 8
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v;
+#endif // M0 >= 8
+#if M0 == 16
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v;
+ VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v;
+#endif // M0 == 16
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+ )_";
+ return code.c_str();
+}
+
+bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
+{
+ if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
+ {
+ return false;
+ }
+
+ // If not floating point
+ if(!is_data_type_float(tensor->data_type()))
+ {
+ return false;
+ }
+
+ if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+ {
+ return false;
+ }
+
+ // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+ if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+ {
+ return false;
+ }
+
+ // Check cl image pitch alignment
+ if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
+ {
+ return false;
+ }
+
+ const size_t image_w = tensor->tensor_shape()[0] / 4;
+ const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
+ const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+ const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+ if(image_w > max_image_w || image_h > max_image_h)
+ {
+ return false;
+ }
+
+ return true;
+}
+
+CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const
+{
+ const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+ const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+ const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+ const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
+ const DataType data_type = src_info->data_type();
+ const GPUTarget gpu_target = ICLKernel().get_target();
+
+ Window win = get_window();
+
+ const unsigned int n0 = win.x().step();
+ const unsigned int m0 = win.y().step();
+ const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx));
+ const unsigned int partial_store_n0 = dst_info->dimension(channel_idx) % n0;
+ const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
+
+ // Update the padding for the weights tensor if we can export to cl_image
+ if(export_to_cl_image)
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weight_info);
+ }
+
+ CLBuildOptions build_opts{};
+ build_opts.add_option("-cl-fast-relaxed-math");
+ build_opts.add_option("-DIS_TILED");
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+ return build_opts;
+}
+
+ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const
+{
+ TagLUT lut{};
+
+ const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+ const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+ const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
+ const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+ const GPUTarget gpu_target = ICLKernel().get_target();
+ const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
+
+ const TensorArgType weight_type = export_to_cl_image ? TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer;
+ lut["meta_kernel_id"] = id();
+ lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src");
+ lut["weight"] = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight");
+
+ if(!_bias.is_empty()) // optional bias
+ {
+ lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias");
+ lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type());
+ }
+ lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
+
+ // Local build options
+ const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
+ const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
+ const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
+
+ lut["dst_w"] = dst_info->dimension(width_idx);
+ lut["dst_h"] = dst_info->dimension(height_idx);
+ lut["dst_c"] = dst_info->dimension(channel_idx);
+
+ lut["ACC_DATA_TYPE"] = src_info->data_type();
+ lut["SRC_DATA_TYPE"] = src_info->data_type();
+ lut["WEI_DATA_TYPE"] = weight_info->data_type();
+
+ lut["SRC_TENSOR_TYPE"] = "BUFFER";
+ lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? "IMAGE" : "BUFFER";
+
+ lut["WEI_WIDTH"] = weight_info->dimension(width_idx);
+ lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
+
+ lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride());
+ lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride());
+
+ lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left();
+ lut["PAD_TOP"] = _desc.pad_stride_info.pad_top();
+
+ lut["ZERO_VALUE"] = 0;
+
+ return lut;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
new file mode 100644
index 0000000000..10c0e00a58
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
+
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
+
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClDirectConvolutionKernelComponent : public IClKernelComponent
+{
+public:
+ ClDirectConvolutionKernelComponent(const ClKernelBlueprint *blueprint, const DirectConvolutionDescriptor &desc,
+ const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{})
+ : IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst }
+ {
+ }
+
+ ComponentType get_component_type() const override;
+ std::set<std::string> get_headers_list() const override;
+ std::string get_additional_macros() const override;
+ std::string get_component_code() const override;
+ Window get_window() const override;
+ ClKernelArgList get_args();
+ CLBuildOptions generate_build_options() const override;
+
+ virtual std::vector<Link> get_links() const override
+ {
+ return { _src, _weight, _bias, _dst };
+ }
+
+ virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+
+ virtual std::string name() const override
+ {
+ return "direct_convolution_" + to_string(_blueprint->impl().get_kernel_argument_info(_src.arg_id)->data_layout()) + "_" + std::to_string(id());
+ }
+
+private:
+ DirectConvolutionDescriptor _desc{};
+ Link _src{};
+ Link _weight{};
+ Link _bias{};
+ Link _dst{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
+
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index bbdf8df0a3..34b735edc9 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -41,7 +41,7 @@ ComponentType ClElementwiseAddKernelComponent::get_component_type() const
std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
{
- return std::set<std::string> { "gemm_helpers.h", "repeat.h" };
+ return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "gemm_helpers.h", "repeat.h", "tile_helpers.h" };
}
Window ClElementwiseAddKernelComponent::get_window() const
@@ -78,6 +78,36 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const
LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, addend, addend_addr, 0, {{addend}}_stride_y, g_zero, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \
MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD_X_POS_0, M0, N0, {{acc}}, addend, DATA_TYPE_ACCUMULATOR, addend_hp);
}
+
+ // Workaround for the discrepancy between tiles and repeats
+#if defined(IS_TILED)
+ {{acc}}[0].v = {{acc}}0;
+#if M0 >= 2
+ {{acc}}[1].v = {{acc}}1;
+#endif // M0 >= 2
+#if M0 >= 3
+ {{acc}}[2].v = {{acc}}2;
+#endif // M0 >= 3
+#if M0 >= 4
+ {{acc}}[3].v = {{acc}}3;
+#endif // M0 >= 4
+#if M0 >= 8
+ {{acc}}[4].v = {{acc}}4;
+ {{acc}}[5].v = {{acc}}5;
+ {{acc}}[6].v = {{acc}}6;
+ {{acc}}[7].v = {{acc}}7;
+#endif // M0 >= 8
+#if M0 == 16
+ {{acc}}[8].v = {{acc}}8;
+ {{acc}}[9].v = {{acc}}9;
+ {{acc}}[10].v = {{acc}}A;
+ {{acc}}[11].v = {{acc}}B;
+ {{acc}}[12].v = {{acc}}C;
+ {{acc}}[13].v = {{acc}}D;
+ {{acc}}[14].v = {{acc}}E;
+ {{acc}}[15].v = {{acc}}F;
+#endif // M0 == 16
+#endif // defined(IS_TILED)
//------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
)_";
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
index b751ce237f..de02f948e9 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
@@ -26,6 +26,7 @@
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
index 2d7b46616f..5f023ba528 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
@@ -86,6 +86,60 @@ ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernel
{ "dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst") },
};
}
+
+ComponentType ClStoreIndirectWidthSelectKernelComponent::get_component_type() const
+{
+ return ComponentType::Store;
+}
+
+std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() const
+{
+ return R"_(
+ //------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)({{dst_w}} * {{dst_h}}) - 1);
+ dst_indirect_y[i].v += bout * (int)({{dst_w}} * {{dst_h}});
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, PARTIAL_N0 != 0 && g_cond_x, {{src}}, dst_indirect_y);
+
+ //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+
+)_";
+}
+
+CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options() const
+{
+ CLBuildOptions build_opts{};
+
+ return build_opts;
+}
+
+ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::allocate_vars(SharedVarTable &vtable) const
+{
+ TagLUT lut{};
+
+ lut["meta_kernel_id"] = id();
+ lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src");
+ lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
+
+ // Local build options
+ auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+
+ lut["dst_w"] = dst_info->dimension(1);
+ lut["dst_h"] = dst_info->dimension(2);
+
+ lut["DST_TENSOR_TYPE"] = "BUFFER";
+ lut["DST_DATA_TYPE"] = dst_info->data_type();
+
+ return lut;
+}
+
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index 8d58da2a0d..c7da8bd3e8 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -62,6 +62,34 @@ private:
Link _dst{};
};
+class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent
+{
+public:
+ ClStoreIndirectWidthSelectKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+ : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
+ {
+ }
+ ComponentType get_component_type() const override;
+ std::string get_component_code() const override;
+ CLBuildOptions generate_build_options() const override;
+
+ virtual std::vector<Link> get_links() const override
+ {
+ return { _src, _dst };
+ }
+
+ virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+
+ virtual std::string name() const override
+ {
+ return "";
+ }
+
+private:
+ Link _src{};
+ Link _dst{};
+};
+
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
index 05912dfd81..472cfb9df0 100644
--- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
+++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
@@ -66,7 +66,6 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe
ARM_COMPUTE_ERROR("Unsupported yet");
break;
}
-
case TensorArgType::Vector:
{
add_1D_tensor_argument(idx, tensor, arg_slice);
@@ -93,7 +92,6 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe
_kernel.setArg(idx++, tensor_image2d);
break;
}
-
case TensorArgType::Image_3D:
{
add_2D_tensor_argument(idx, tensor, arg_slice);
@@ -109,18 +107,34 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
break;
}
-
case TensorArgType::Tensor_3D:
{
add_3D_tensor_argument(idx, tensor, arg_slice);
break;
}
-
case TensorArgType::Tensor_4D:
{
add_4D_tensor_argument(idx, tensor, arg_slice);
break;
}
+ case TensorArgType::Tensor_4D_t_Buffer:
+ {
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
+ case TensorArgType::Tensor_4D_t_Image:
+ {
+ const size_t image_w = tensor->info()->dimension(0) / 4;
+ const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
+ const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
+
+ cl::Image2D tensor_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
+ TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
+
+ _kernel.setArg(idx++, tensor_cl_image);
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
default:
{
ARM_COMPUTE_ERROR("Unsupported");
@@ -140,6 +154,7 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w
Window slice_fixed_z = slice;
slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));
+
unsigned int idx = 0;
do
{
@@ -162,7 +177,7 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w
bool use_dummy_work_items = false;
enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
}
- while(window.slide_window_slice_3D(slice));
+ while(!exec_desc.skip_sliding_window && window.slide_window_slice_3D(slice));
}
Status bind_arguments(ITensorPack &, const ClKernelCode &, const TensorBinding &)
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 22bf850d6e..18fd52232d 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -141,6 +141,13 @@ void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_
_cl_tuner = cl_tuner;
}
+void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+{
+ _is_initialised = false;
+
+ default_init(cl_tuner, gemm_h, cl_backend_type);
+}
+
void CLScheduler::set_context(cl::Context context)
{
_context = std::move(context);
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index cb365901da..9e1b4d897b 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -32,8 +32,10 @@
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
#include "tests/validation/reference/ElementwiseOperations.h"
#include "tests/validation/reference/GEMM.h"
+#include "tests/validation/reference/Permute.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
@@ -83,7 +85,7 @@ TEST_SUITE(DYNAMIC_FUSION)
TEST_SUITE(ClCompositeKernel)
TEST_SUITE(Validate)
-TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
+TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL)
{
/* Computation:
* out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast)
@@ -100,11 +102,11 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type);
- const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
- const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
- const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
- const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
- const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };
+ const ClTensorDescriptor t_lhs_desc{ &t_lhs_info };
+ const ClTensorDescriptor t_rhs_desc{ &t_rhs_info };
+ const ClTensorDescriptor t_bias_desc{ &t_bias_info };
+ const ClTensorDescriptor t_addend_desc{ &t_dst_info };
+ const ClTensorDescriptor t_dst_desc{ &t_dst_info };
ClKernelBlueprint bp;
ArgumentID tid_lhs;
@@ -134,10 +136,10 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
st = set_tile_info(bp, store_tile_info);
st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
- ClExecutionDescriptor exec_desc;
+ ClExecutionDescriptor exec_desc{};
st = tune_static(exec_desc, cl_code);
- CLScheduler::get().default_init();
+ CLScheduler::get().default_reinit();
ClCompositeKernel kernel;
kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
@@ -193,10 +195,149 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32);
}
+TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = add(addend, direct_conv2d(lhs, rhs, bias)) (non-broadcast)
+ */
+
+ ClCompositeKernel kernel{};
+ ClKernelBlueprint bp{};
+ ClKernelCode cl_code{};
+ ClExecutionDescriptor exec_desc{};
+ Status st{};
+
+ const auto data_type = DataType::F32;
+ const auto conv_info = PadStrideInfo(1U, 1U, 1U, 1U);
+
+ const auto width = 7U;
+ const auto height = 6U;
+ const auto IFM = 5U;
+ const auto OFM = 4U;
+ const auto kernel_sz = 3U;
+
+ const auto src_shape = TensorShape(IFM, width, height);
+ const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM);
+ const auto bia_shape = TensorShape(OFM);
+ const auto dst_shape = TensorShape(OFM, width, height);
+
+ auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC);
+ auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC);
+ auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC);
+ auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC);
+
+ const auto src_desc = ClTensorDescriptor(&src_info);
+ const auto wei_desc = ClTensorDescriptor(&wei_info);
+ const auto bia_desc = ClTensorDescriptor(&bia_info);
+ const auto addend_desc = ClTensorDescriptor(&dst_info);
+ const auto dst_desc = ClTensorDescriptor(&dst_info);
+
+ const auto n0 = std::min(OFM, 4u);
+ const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
+
+ const ClKernelComponentDescriptor common_kernel_desc{};
+ const DirectConvolutionDescriptor direct_conv2d_desc{ conv_info };
+ const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
+ const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT };
+
+ ArgumentID src_id{ g_arg_placeholder };
+ ArgumentID wei_id{ g_arg_placeholder };
+ ArgumentID bia_id{ g_arg_placeholder };
+ ArgumentID acc_id{ g_arg_placeholder };
+ ArgumentID addend_id{ g_arg_placeholder };
+ ArgumentID dst_id{ g_arg_placeholder };
+
+ st = add_tensor_argument(bp, src_desc, src_id);
+ st = add_tensor_argument(bp, wei_desc, wei_id);
+ st = add_tensor_argument(bp, bia_desc, bia_id);
+ st = add_tensor_intermed(bp, acc_id);
+ st = add_tensor_argument(bp, addend_desc, addend_id);
+ st = add_tensor_argument(bp, dst_desc, dst_id);
+
+ st = add_kcomp_direct_conv(bp, common_kernel_desc, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
+ st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, addend_id, acc_id, acc_id);
+ st = add_kcomp_store(bp, common_kernel_desc, acc_id, dst_id, StoreType::TStoreIndirectWidthSelect);
+
+ exec_desc.skip_sliding_window = true;
+
+ st = set_tile_info(bp, store_tile_info);
+ st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
+ st = tune_static(exec_desc, cl_code);
+
+ CLScheduler::get().default_reinit();
+ kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
+
+ // Construct tensors
+ CLTensor src{};
+ CLTensor wei{};
+ CLTensor bia{};
+ CLTensor addend{};
+ CLTensor dst{};
+
+ // Init tensors
+ src.allocator()->init(src_info);
+ wei.allocator()->init(wei_info);
+ bia.allocator()->init(bia_info);
+ addend.allocator()->init(dst_info);
+ dst.allocator()->init(dst_info);
+
+ // "Pack" tensors
+ TensorBinding tensors({ { src_id, &src },
+ { wei_id, &wei },
+ { bia_id, &bia },
+ { addend_id, &addend },
+ { dst_id, &dst }
+ });
+
+ // Allocate and fill tensors
+ src.allocator()->allocate();
+ wei.allocator()->allocate();
+ bia.allocator()->allocate();
+ addend.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ fill<float>(CLAccessor(src), 0);
+ fill<float>(CLAccessor(wei), 1);
+ fill<float>(CLAccessor(bia), 2);
+ fill<float>(CLAccessor(addend), 3);
+
+ CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
+
+ // Create reference
+ SimpleTensor<float> ref_src_nhwc{ src_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_wei_nhwc{ wei_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_bia_nhwc{ bia_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_addend_nhwc{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+ // Fill reference
+ fill<float>(ref_src_nhwc, 0);
+ fill<float>(ref_wei_nhwc, 1);
+ fill<float>(ref_bia_nhwc, 2);
+ fill<float>(ref_addend_nhwc, 3);
+
+ auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U));
+ auto ref_wei = reference::permute(ref_wei_nhwc, PermutationVector(1U, 2U, 0U));
+ auto ref_bia = reference::permute(ref_bia_nhwc, PermutationVector(1U, 2U, 0U));
+ auto ref_addend = reference::permute(ref_addend_nhwc, PermutationVector(1U, 2U, 0U));
+
+ TensorShape dst_shape_nchw{ dst_shape };
+ permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+ const auto ref_dst = reference::arithmetic_operation(
+ ArithmeticOperation::ADD,
+ ref_addend,
+ reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info),
+ data_type,
+ eltwise_add_desc.convert_policy);
+
+ RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(dst), ref_dst, tolerance_f32);
+}
+
TEST_SUITE_END() // Validate
TEST_SUITE(Benchmark)
-TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
+TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL)
{
using std::chrono::duration_cast;
using std::chrono::microseconds;
@@ -205,19 +346,19 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
/* Computation:
* out = add(addend, gemm_native(lhs, rhs, bias))
*/
- const auto data_type = DataType::F32;
- const unsigned int m = 12 * 12;
- const unsigned int n = 64;
- const unsigned int k = 384;
- const auto t_lhs_shape = TensorShape(k, m);
- const auto t_rhs_shape = TensorShape(n, k);
- const auto t_dst_shape = TensorShape(n, m);
- auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type);
- auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type);
- auto t_bias_info = TensorInfo(TensorShape(), 1, data_type);
- auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
- auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
- auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type);
+ const auto data_type = DataType::F32;
+ const auto m = 12U * 12U;
+ const auto n = 64U;
+ const auto k = 384U;
+ const auto t_lhs_shape = TensorShape(k, m);
+ const auto t_rhs_shape = TensorShape(n, k);
+ const auto t_dst_shape = TensorShape(n, m);
+ auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type);
+ auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type);
+ auto t_bias_info = TensorInfo(TensorShape(), 1, data_type);
+ auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
+ auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
+ auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type);
const auto common_kernel_desc = ClKernelComponentDescriptor{};
const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k };
@@ -242,7 +383,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
data_type,
eltwise_add_desc.convert_policy);
- CLScheduler::get().default_init();
+ CLScheduler::get().default_reinit();
/* Condition 0: Dynamic Fused Kernel */
CLTensor cond0_t_dst{};
@@ -256,11 +397,11 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
ArgumentID tid_l1_addend;
ArgumentID tid_dst;
- const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 };
- const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 };
- const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 };
- const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 };
- const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 };
+ const ClTensorDescriptor t_lhs_desc{ &t_lhs_info };
+ const ClTensorDescriptor t_rhs_desc{ &t_rhs_info };
+ const ClTensorDescriptor t_bias_desc{ &t_bias_info };
+ const ClTensorDescriptor t_addend_desc{ &t_dst_info };
+ const ClTensorDescriptor t_dst_desc{ &t_dst_info };
ClKernelCode cl_code;
TICK(cond0_build_time)
@@ -282,7 +423,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL)
TOCK(cond0_build_time, measurements)
TICK(cond0_tune_time)
- ClExecutionDescriptor exec_desc;
+ ClExecutionDescriptor exec_desc{};
st = tune_static(exec_desc, cl_code);
TOCK(cond0_tune_time, measurements)