From 16c5697085c256c19fb8ba4bef6188d61f30a88b Mon Sep 17 00:00:00 2001 From: Gunes Bayir Date: Mon, 28 Mar 2022 21:32:33 +0100 Subject: Add DirectConvolution2D kernel component for dynamic fusion Resolves: COMPMID-5156 Change-Id: I438da924cb80d3bce72106b06ca7181e0606bd01 Signed-off-by: Gunes Bayir Signed-off-by: Giorgio Arena Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7399 Reviewed-by: SiCong Li Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- Android.bp | 1 + arm_compute/runtime/CL/CLScheduler.h | 9 + filelist.json | 1 + .../dynamic_fusion/ClKernelBuildingAPI.cpp | 24 ++ .../dynamic_fusion/ClKernelBuildingAPI.h | 25 +- .../dynamic_fusion/ClKernelBuildingImpl/Common.h | 46 ++- .../ClDirectConvolutionKernelComponent.cpp | 398 +++++++++++++++++++++ .../ClDirectConvolutionKernelComponent.h | 81 +++++ .../components/ClElementwiseAddKernelComponent.cpp | 32 +- .../components/ClKernelComponents.h | 1 + .../components/ClStoreKernelComponents.cpp | 54 +++ .../components/ClStoreKernelComponents.h | 28 ++ .../dynamic_fusion/ClCompositeKernel.cpp | 25 +- src/runtime/CL/CLScheduler.cpp | 7 + .../CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | 199 +++++++++-- 15 files changed, 880 insertions(+), 51 deletions(-) create mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp create mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h diff --git a/Android.bp b/Android.bp index 0d53040e18..a440e79ffd 100644 --- a/Android.bp +++ b/Android.bp @@ -364,6 +364,7 @@ cc_library_static { "src/core/Validate.cpp", "src/core/Version.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp", + "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp", diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h index 7ad59782e7..362d2ba137 100644 --- a/arm_compute/runtime/CL/CLScheduler.h +++ b/arm_compute/runtime/CL/CLScheduler.h @@ -87,6 +87,15 @@ public: */ void default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr); + /** Re-initializes the context and command queue used by the scheduler to default values + * and sets a default device and kernel path for the @ref CLKernelLibrary. + * + * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr) + * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr) + * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native) + */ + void default_reinit(ICLTuner *cl_tuner = nullptr, CLGEMMHeuristicsHandle *gemm_h = nullptr, CLBackendType cl_backend_type = CLBackendType::Native); + /** Schedule the execution of the passed kernel if possible. * * @param[in] kernel Kernel to execute. diff --git a/filelist.json b/filelist.json index 1af856d03b..44e71c7e69 100644 --- a/filelist.json +++ b/filelist.json @@ -2064,6 +2064,7 @@ "experimental": { "dynamic_fusion": [ "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp", + "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp", diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp index 6db1ca4cf5..3e9ed060be 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp @@ -94,6 +94,23 @@ Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescript { return Status{}; } + +Status add_kcomp_direct_conv(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, + const DirectConvolutionDescriptor &direct_conv2d_desc, + ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id) +{ + kernel_blueprint.impl().add_component( + std::make_unique( + &kernel_blueprint, + direct_conv2d_desc, + SharedVarLink{ src_id, SharedVarIO::Input, kernel_blueprint.impl().group(src_id) }, + SharedVarLink{ weight_id, SharedVarIO::Input, kernel_blueprint.impl().group(weight_id) }, + SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }, + SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) })); + + return Status{}; +} + Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, ArgumentID src_tile, ArgumentID dst_tile, const StoreType &store_type) { switch(store_type) @@ -105,6 +122,13 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) }, SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) })); break; + case StoreType::TStoreIndirectWidthSelect: + kernel_blueprint.impl().add_component( + std::make_unique( + &kernel_blueprint, + SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) }, + SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) })); + break; default: ARM_COMPUTE_ERROR("Store mode not yet supported."); } diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h index 27ab294cc9..23629f47bc 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h @@ -56,7 +56,10 @@ enum class TensorArgType : int Image_3D_Export_To_ClImage2D, Tensor_3D, - Tensor_4D + Tensor_4D, + + Tensor_4D_t_Buffer, + Tensor_4D_t_Image }; /** Describes all the info required to add a kernel argument at run time */ struct ClKernelArgRuntimeDescriptor @@ -100,13 +103,12 @@ struct ClKernelComponentDescriptor /** Component: Tensor Argument */ struct ClTensorDescriptor { - ClTensorDescriptor(ITensorInfo *info, unsigned int dim) - : tensor_info(info), slice_dim(dim) + ClTensorDescriptor(ITensorInfo *info) + : tensor_info(info) { } ITensorInfo *tensor_info; - unsigned int slice_dim; }; Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &); @@ -134,7 +136,7 @@ struct GemmNativeDescriptor }; Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &, - ArgumentID input_id, ArgumentID weights_id, ArgumentID bias_id, ArgumentID &dst_id); + ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id); /** Component: Eltwise Add */ struct EltwiseAddDescriptor @@ -150,6 +152,14 @@ struct ActivationDescriptor }; Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID src_id, ArgumentID &dst_id); +/** Component: Direct Convolution **/ +struct DirectConvolutionDescriptor +{ + PadStrideInfo pad_stride_info{}; +}; +Status add_kcomp_direct_conv(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const DirectConvolutionDescriptor &, + ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id); + enum class ClippingStrategy { TOP_LEFT, @@ -239,8 +249,9 @@ Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint ///// Tuning ///// struct ClExecutionDescriptor { - cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */ - cl::NDRange gws{}; /**< Global work-group to be used */ + cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */ + cl::NDRange gws{}; /**< Global work-group to be used */ + bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */ }; Status tune_static(ClExecutionDescriptor &, const ClKernelCode &); diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h index 4c720ea1aa..e24c742fd7 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h @@ -31,6 +31,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/GPUTarget.h" #include "src/core/common/Macros.h" +#include "support/Requires.h" #include "support/StringSupport.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" @@ -198,8 +199,9 @@ public: { } - TagVal(ComponentID id) - : value{ std::to_string(id) } + template ::value)> + TagVal(T val) + : value{ support::cpp11::to_string(val) } { } @@ -208,6 +210,16 @@ public: { } + TagVal(const char *val) + : value{ std::string(val) } + { + } + + TagVal(const DataType &data_type) + : value{ get_cl_type_from_data_type(data_type) } + { + } + std::string value{}; }; using TagLUT = std::unordered_map; // Used to instantiating a code template / replacing tags @@ -633,21 +645,36 @@ private: std::string code; switch(var.desc.tensor_arg_type) { + case TensorArgType::Vector: + { + code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")"; + break; + } case TensorArgType::Image: { - code += "IMAGE_DECLARATION(" + var.uniq_name + ")"; + code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")"; break; } case TensorArgType::Image_3D: { - code += "IMAGE_DECLARATION(" + var.uniq_name + "),\n"; - code += "uint " + var.uniq_name + "_stride_z"; + code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),"; + code += "\n uint " + var.uniq_name + "_stride_z"; break; } case TensorArgType::Image_3D_Export_To_ClImage2D: { - code += "__read_only image2d_t " + var.uniq_name + "_img,\n"; - code += "uint " + var.uniq_name + "_stride_z,\n"; + code += "\n __read_only image2d_t " + var.uniq_name + "_img,"; + code += "\n uint " + var.uniq_name + "_stride_z"; + break; + } + case TensorArgType::Tensor_4D_t_Buffer: + { + code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)"; + break; + } + case TensorArgType::Tensor_4D_t_Image: + { + code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)"; break; } default: @@ -664,7 +691,7 @@ private: for(const auto &arg : argument_list) { - code += "\n " + generate_argument_declaration(arg) + ","; + code += generate_argument_declaration(arg) + ","; } code[code.length() - 1] = ')'; @@ -674,7 +701,8 @@ private: std::string generate_global_section() const { - std::string code = " uint g_x = get_global_id(0);\n"; + std::string code = ""; + code += " uint g_x = get_global_id(0);\n"; code += " uint g_y = get_global_id(1);\n"; code += " uint g_z = get_global_id(2);\n\n"; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp new file mode 100644 index 0000000000..f951ce3d46 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) + +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/ICLKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ComponentType ClDirectConvolutionKernelComponent::get_component_type() const +{ + return ComponentType::Complex; +} + +std::set ClDirectConvolutionKernelComponent::get_headers_list() const +{ + return std::set { "helpers.h", "tile_helpers.h", "repeat.h" }; +} + +Window ClDirectConvolutionKernelComponent::get_window() const +{ + const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); + const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + // Get dst shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst_info, output_shape, + 1, + src_info->data_type(), + src_info->quantization_info()); + + const unsigned int vec_size = std::min(static_cast(dst_info->tensor_shape()[0]), 4u); + const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U; + + // Create and configure kernel window + Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + + const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows); + win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows)); + win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1)); + + return win; +} + +std::string ClDirectConvolutionKernelComponent::get_additional_macros() const +{ + return R"_()_"; // no macros +} + +std::string ClDirectConvolutionKernelComponent::get_component_code() const +{ + const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); + const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); + + ARM_COMPUTE_ERROR_ON_MSG(src_info->data_layout() != DataLayout::NHWC, "Only NHWC data layout is supported by this component."); + + const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); + const auto k0 = adjust_vec_size(is_data_type_quantized(src_info->data_type()) ? 16u : 8u, src_info->dimension(channel_idx)); + const bool leftover_loop = (src_info->dimension(channel_idx) % k0) != 0; + + std::string code = R"_( + //------------------ START KERNEL {{meta_kernel_id}} --------------------- + // IN_0(src) {{src}} + // IN_1(wei) {{weight}} + // IN_1(bia) {{bias}} + // OUT(dst, accum) {{dst}} + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + // Initialize the accumulators + TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); + { + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. + #define _I{{WEI_WIDTH}} {{WEI_WIDTH}} + #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}} + #define _ISRC_WIDTH {{src}}_w + #define _ISRC_HEIGHT {{src}}_h + #define _ISRC_CHANNELS {{src}}_c + #define _IDST_WIDTH {{dst_w}} + #define _IDST_HEIGHT {{dst_h}} + #define _IDST_CHANNELS {{dst_c}} + #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}) + + // .v = access the whole vector (OpenCL vector) + // .s[x] = access the vector element at position x (scalar access) + TILE(int, M0, 1, xi); + TILE(int, M0, 1, yi); + + // Convert the linear index to coordinate + LOOP_UNROLLING(int, i, 0, 1, M0, + { + xi[i].v = ((mout + i) % _IDST_WIDTH) * {{STRIDE_X}}; + yi[i].v = ((mout + i) / _IDST_WIDTH) * {{STRIDE_Y}}; + xi[i].v -= {{PAD_LEFT}}; + yi[i].v -= {{PAD_TOP}}; + }) + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + {{dst}}[i].v = 0; + }) + + uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0); + + for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i) + { + int ck = 0; + int xk = i % _I{{WEI_WIDTH}}; + int yk = i / _I{{WEI_WIDTH}}; + + int k = 0; + for(; k <= (_ISRC_CHANNELS - K0); k += K0) + { + TILE({{SRC_DATA_TYPE}}, M0, K0, a); + TILE({{WEI_DATA_TYPE}}, N0, K0, b); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = {{ZERO_VALUE}}; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); + + // Load tile from the weights tensor + T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}}); + + ck += K0; + } + + // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS + // This #if directive should be removed in case of dynamic tensor support + )_"; + + if(leftover_loop) + { + code += R"_( + // Left-over accumulations + for(; k < _ISRC_CHANNELS; ++k) + { + TILE({{SRC_DATA_TYPE}}, M0, 1, a); + TILE({{WEI_DATA_TYPE}}, N0, 1, b); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = {{ZERO_VALUE}}; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); + + // Load tile from the weights tensor + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); + + ++ck; + } + )_"; + } + + code += R"_( + } + )_"; + + if(bias_info != nullptr) + { + code += R"_( + TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); + + T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); + )_"; + } + + code += R"_( + #undef _I{{WEI_WIDTH}} + #undef _I{{WEI_HEIGHT}} + #undef _ISRC_WIDTH + #undef _ISRC_HEIGHT + #undef _ISRC_CHANNELS + #undef _IDST_WIDTH + #undef _IDST_HEIGHT + #undef _IDST_CHANNELS + #undef _IY_MULTIPLIER + } + + // Workaround for the discrepancy between tiles and repeats + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v; +#if M0 >= 2 + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v; +#endif // M0 >= 2 +#if M0 >= 3 + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v; +#endif // M0 >= 3 +#if M0 >= 4 + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v; +#endif // M0 >= 4 +#if M0 >= 8 + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v; +#endif // M0 >= 8 +#if M0 == 16 + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v; + VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v; +#endif // M0 == 16 +//------------------ END KERNEL {{meta_kernel_id}} --------------------- + )_"; + return code.c_str(); +} + +bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) +{ + if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) + { + return false; + } + + // If not floating point + if(!is_data_type_float(tensor->data_type())) + { + return false; + } + + if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) + { + return false; + } + + // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform + if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + return false; + } + + // Check cl image pitch alignment + if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + { + return false; + } + + const size_t image_w = tensor->tensor_shape()[0] / 4; + const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; + const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); + const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); + + if(image_w > max_image_w || image_h > max_image_h) + { + return false; + } + + return true; +} + +CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const +{ + const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); + const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); + const DataType data_type = src_info->data_type(); + const GPUTarget gpu_target = ICLKernel().get_target(); + + Window win = get_window(); + + const unsigned int n0 = win.x().step(); + const unsigned int m0 = win.y().step(); + const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx)); + const unsigned int partial_store_n0 = dst_info->dimension(channel_idx) % n0; + const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); + + // Update the padding for the weights tensor if we can export to cl_image + if(export_to_cl_image) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weight_info); + } + + CLBuildOptions build_opts{}; + build_opts.add_option("-cl-fast-relaxed-math"); + build_opts.add_option("-DIS_TILED"); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const +{ + TagLUT lut{}; + + const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); + const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); + const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + const GPUTarget gpu_target = ICLKernel().get_target(); + const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); + + const TensorArgType weight_type = export_to_cl_image ? TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer; + lut["meta_kernel_id"] = id(); + lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src"); + lut["weight"] = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight"); + + if(!_bias.is_empty()) // optional bias + { + lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias"); + lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type()); + } + lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst"); + + // Local build options + const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); + const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); + const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); + + lut["dst_w"] = dst_info->dimension(width_idx); + lut["dst_h"] = dst_info->dimension(height_idx); + lut["dst_c"] = dst_info->dimension(channel_idx); + + lut["ACC_DATA_TYPE"] = src_info->data_type(); + lut["SRC_DATA_TYPE"] = src_info->data_type(); + lut["WEI_DATA_TYPE"] = weight_info->data_type(); + + lut["SRC_TENSOR_TYPE"] = "BUFFER"; + lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? "IMAGE" : "BUFFER"; + + lut["WEI_WIDTH"] = weight_info->dimension(width_idx); + lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); + + lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride()); + lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride()); + + lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left(); + lut["PAD_TOP"] = _desc.pad_stride_info.pad_top(); + + lut["ZERO_VALUE"] = 0; + + return lut; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h new file mode 100644 index 0000000000..10c0e00a58 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) + +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H + +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" + +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClDirectConvolutionKernelComponent : public IClKernelComponent +{ +public: + ClDirectConvolutionKernelComponent(const ClKernelBlueprint *blueprint, const DirectConvolutionDescriptor &desc, + const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{}) + : IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst } + { + } + + ComponentType get_component_type() const override; + std::set get_headers_list() const override; + std::string get_additional_macros() const override; + std::string get_component_code() const override; + Window get_window() const override; + ClKernelArgList get_args(); + CLBuildOptions generate_build_options() const override; + + virtual std::vector get_links() const override + { + return { _src, _weight, _bias, _dst }; + } + + virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; + + virtual std::string name() const override + { + return "direct_convolution_" + to_string(_blueprint->impl().get_kernel_argument_info(_src.arg_id)->data_layout()) + "_" + std::to_string(id()); + } + +private: + DirectConvolutionDescriptor _desc{}; + Link _src{}; + Link _weight{}; + Link _bias{}; + Link _dst{}; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H + +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp index bbdf8df0a3..34b735edc9 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp @@ -41,7 +41,7 @@ ComponentType ClElementwiseAddKernelComponent::get_component_type() const std::set ClElementwiseAddKernelComponent::get_headers_list() const { - return std::set { "gemm_helpers.h", "repeat.h" }; + return std::set { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "gemm_helpers.h", "repeat.h", "tile_helpers.h" }; } Window ClElementwiseAddKernelComponent::get_window() const @@ -78,6 +78,36 @@ std::string ClElementwiseAddKernelComponent::get_component_code() const LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, addend, addend_addr, 0, {{addend}}_stride_y, g_zero, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \ MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD_X_POS_0, M0, N0, {{acc}}, addend, DATA_TYPE_ACCUMULATOR, addend_hp); } + + // Workaround for the discrepancy between tiles and repeats +#if defined(IS_TILED) + {{acc}}[0].v = {{acc}}0; +#if M0 >= 2 + {{acc}}[1].v = {{acc}}1; +#endif // M0 >= 2 +#if M0 >= 3 + {{acc}}[2].v = {{acc}}2; +#endif // M0 >= 3 +#if M0 >= 4 + {{acc}}[3].v = {{acc}}3; +#endif // M0 >= 4 +#if M0 >= 8 + {{acc}}[4].v = {{acc}}4; + {{acc}}[5].v = {{acc}}5; + {{acc}}[6].v = {{acc}}6; + {{acc}}[7].v = {{acc}}7; +#endif // M0 >= 8 +#if M0 == 16 + {{acc}}[8].v = {{acc}}8; + {{acc}}[9].v = {{acc}}9; + {{acc}}[10].v = {{acc}}A; + {{acc}}[11].v = {{acc}}B; + {{acc}}[12].v = {{acc}}C; + {{acc}}[13].v = {{acc}}D; + {{acc}}[14].v = {{acc}}E; + {{acc}}[15].v = {{acc}}F; +#endif // M0 == 16 +#endif // defined(IS_TILED) //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- )_"; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h index b751ce237f..de02f948e9 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h @@ -26,6 +26,7 @@ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h" diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp index 2d7b46616f..5f023ba528 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp @@ -86,6 +86,60 @@ ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernel { "dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst") }, }; } + +ComponentType ClStoreIndirectWidthSelectKernelComponent::get_component_type() const +{ + return ComponentType::Store; +} + +std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() const +{ + return R"_( + //------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- + + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)({{dst_w}} * {{dst_h}}) - 1); + dst_indirect_y[i].v += bout * (int)({{dst_w}} * {{dst_h}}); + }) + + T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, PARTIAL_N0 != 0 && g_cond_x, {{src}}, dst_indirect_y); + + //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- + +)_"; +} + +CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options() const +{ + CLBuildOptions build_opts{}; + + return build_opts; +} + +ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::allocate_vars(SharedVarTable &vtable) const +{ + TagLUT lut{}; + + lut["meta_kernel_id"] = id(); + lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src"); + lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst"); + + // Local build options + auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + + lut["dst_w"] = dst_info->dimension(1); + lut["dst_h"] = dst_info->dimension(2); + + lut["DST_TENSOR_TYPE"] = "BUFFER"; + lut["DST_DATA_TYPE"] = dst_info->data_type(); + + return lut; +} + } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h index 8d58da2a0d..c7da8bd3e8 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h @@ -62,6 +62,34 @@ private: Link _dst{}; }; +class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent +{ +public: + ClStoreIndirectWidthSelectKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst) + : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } + { + } + ComponentType get_component_type() const override; + std::string get_component_code() const override; + CLBuildOptions generate_build_options() const override; + + virtual std::vector get_links() const override + { + return { _src, _dst }; + } + + virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; + + virtual std::string name() const override + { + return ""; + } + +private: + Link _src{}; + Link _dst{}; +}; + } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp index 05912dfd81..472cfb9df0 100644 --- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp +++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp @@ -66,7 +66,6 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe ARM_COMPUTE_ERROR("Unsupported yet"); break; } - case TensorArgType::Vector: { add_1D_tensor_argument(idx, tensor, arg_slice); @@ -93,7 +92,6 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe _kernel.setArg(idx++, tensor_image2d); break; } - case TensorArgType::Image_3D: { add_2D_tensor_argument(idx, tensor, arg_slice); @@ -109,18 +107,34 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); break; } - case TensorArgType::Tensor_3D: { add_3D_tensor_argument(idx, tensor, arg_slice); break; } - case TensorArgType::Tensor_4D: { add_4D_tensor_argument(idx, tensor, arg_slice); break; } + case TensorArgType::Tensor_4D_t_Buffer: + { + add_4d_tensor_nhwc_argument(idx, tensor); + break; + } + case TensorArgType::Tensor_4D_t_Image: + { + const size_t image_w = tensor->info()->dimension(0) / 4; + const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1); + const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; + + cl::Image2D tensor_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), + TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y); + + _kernel.setArg(idx++, tensor_cl_image); + add_4d_tensor_nhwc_argument(idx, tensor); + break; + } default: { ARM_COMPUTE_ERROR("Unsupported"); @@ -140,6 +154,7 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w Window slice_fixed_z = slice; slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1)); + unsigned int idx = 0; do { @@ -162,7 +177,7 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w bool use_dummy_work_items = false; enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items); } - while(window.slide_window_slice_3D(slice)); + while(!exec_desc.skip_sliding_window && window.slide_window_slice_3D(slice)); } Status bind_arguments(ITensorPack &, const ClKernelCode &, const TensorBinding &) diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index 22bf850d6e..18fd52232d 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -141,6 +141,13 @@ void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_ _cl_tuner = cl_tuner; } +void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +{ + _is_initialised = false; + + default_init(cl_tuner, gemm_h, cl_backend_type); +} + void CLScheduler::set_context(cl::Context context) { _context = std::move(context); diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp index cb365901da..9e1b4d897b 100644 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp @@ -32,8 +32,10 @@ #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" #include "tests/validation/Validation.h" +#include "tests/validation/reference/ConvolutionLayer.h" #include "tests/validation/reference/ElementwiseOperations.h" #include "tests/validation/reference/GEMM.h" +#include "tests/validation/reference/Permute.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/AccessWindowStatic.h" @@ -83,7 +85,7 @@ TEST_SUITE(DYNAMIC_FUSION) TEST_SUITE(ClCompositeKernel) TEST_SUITE(Validate) -TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) +TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) { /* Computation: * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast) @@ -100,11 +102,11 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32); auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 }; - const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 }; + const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; + const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; + const ClTensorDescriptor t_bias_desc{ &t_bias_info }; + const ClTensorDescriptor t_addend_desc{ &t_dst_info }; + const ClTensorDescriptor t_dst_desc{ &t_dst_info }; ClKernelBlueprint bp; ArgumentID tid_lhs; @@ -134,10 +136,10 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) st = set_tile_info(bp, store_tile_info); st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - ClExecutionDescriptor exec_desc; + ClExecutionDescriptor exec_desc{}; st = tune_static(exec_desc, cl_code); - CLScheduler::get().default_init(); + CLScheduler::get().default_reinit(); ClCompositeKernel kernel; kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); @@ -193,10 +195,149 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32); } +TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) +{ + /* Computation: + * out = add(addend, direct_conv2d(lhs, rhs, bias)) (non-broadcast) + */ + + ClCompositeKernel kernel{}; + ClKernelBlueprint bp{}; + ClKernelCode cl_code{}; + ClExecutionDescriptor exec_desc{}; + Status st{}; + + const auto data_type = DataType::F32; + const auto conv_info = PadStrideInfo(1U, 1U, 1U, 1U); + + const auto width = 7U; + const auto height = 6U; + const auto IFM = 5U; + const auto OFM = 4U; + const auto kernel_sz = 3U; + + const auto src_shape = TensorShape(IFM, width, height); + const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); + const auto bia_shape = TensorShape(OFM); + const auto dst_shape = TensorShape(OFM, width, height); + + auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); + auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); + auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); + auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); + + const auto src_desc = ClTensorDescriptor(&src_info); + const auto wei_desc = ClTensorDescriptor(&wei_info); + const auto bia_desc = ClTensorDescriptor(&bia_info); + const auto addend_desc = ClTensorDescriptor(&dst_info); + const auto dst_desc = ClTensorDescriptor(&dst_info); + + const auto n0 = std::min(OFM, 4u); + const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U; + + const ClKernelComponentDescriptor common_kernel_desc{}; + const DirectConvolutionDescriptor direct_conv2d_desc{ conv_info }; + const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; + const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; + + ArgumentID src_id{ g_arg_placeholder }; + ArgumentID wei_id{ g_arg_placeholder }; + ArgumentID bia_id{ g_arg_placeholder }; + ArgumentID acc_id{ g_arg_placeholder }; + ArgumentID addend_id{ g_arg_placeholder }; + ArgumentID dst_id{ g_arg_placeholder }; + + st = add_tensor_argument(bp, src_desc, src_id); + st = add_tensor_argument(bp, wei_desc, wei_id); + st = add_tensor_argument(bp, bia_desc, bia_id); + st = add_tensor_intermed(bp, acc_id); + st = add_tensor_argument(bp, addend_desc, addend_id); + st = add_tensor_argument(bp, dst_desc, dst_id); + + st = add_kcomp_direct_conv(bp, common_kernel_desc, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); + st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, addend_id, acc_id, acc_id); + st = add_kcomp_store(bp, common_kernel_desc, acc_id, dst_id, StoreType::TStoreIndirectWidthSelect); + + exec_desc.skip_sliding_window = true; + + st = set_tile_info(bp, store_tile_info); + st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); + st = tune_static(exec_desc, cl_code); + + CLScheduler::get().default_reinit(); + kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); + + // Construct tensors + CLTensor src{}; + CLTensor wei{}; + CLTensor bia{}; + CLTensor addend{}; + CLTensor dst{}; + + // Init tensors + src.allocator()->init(src_info); + wei.allocator()->init(wei_info); + bia.allocator()->init(bia_info); + addend.allocator()->init(dst_info); + dst.allocator()->init(dst_info); + + // "Pack" tensors + TensorBinding tensors({ { src_id, &src }, + { wei_id, &wei }, + { bia_id, &bia }, + { addend_id, &addend }, + { dst_id, &dst } + }); + + // Allocate and fill tensors + src.allocator()->allocate(); + wei.allocator()->allocate(); + bia.allocator()->allocate(); + addend.allocator()->allocate(); + dst.allocator()->allocate(); + + fill(CLAccessor(src), 0); + fill(CLAccessor(wei), 1); + fill(CLAccessor(bia), 2); + fill(CLAccessor(addend), 3); + + CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); + + // Create reference + SimpleTensor ref_src_nhwc{ src_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_wei_nhwc{ wei_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_bia_nhwc{ bia_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_addend_nhwc{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + + // Fill reference + fill(ref_src_nhwc, 0); + fill(ref_wei_nhwc, 1); + fill(ref_bia_nhwc, 2); + fill(ref_addend_nhwc, 3); + + auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U)); + auto ref_wei = reference::permute(ref_wei_nhwc, PermutationVector(1U, 2U, 0U)); + auto ref_bia = reference::permute(ref_bia_nhwc, PermutationVector(1U, 2U, 0U)); + auto ref_addend = reference::permute(ref_addend_nhwc, PermutationVector(1U, 2U, 0U)); + + TensorShape dst_shape_nchw{ dst_shape }; + permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U)); + + const auto ref_dst = reference::arithmetic_operation( + ArithmeticOperation::ADD, + ref_addend, + reference::convolution_layer(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info), + data_type, + eltwise_add_desc.convert_policy); + + RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ + validate(CLAccessor(dst), ref_dst, tolerance_f32); +} + TEST_SUITE_END() // Validate TEST_SUITE(Benchmark) -TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) +TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) { using std::chrono::duration_cast; using std::chrono::microseconds; @@ -205,19 +346,19 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) /* Computation: * out = add(addend, gemm_native(lhs, rhs, bias)) */ - const auto data_type = DataType::F32; - const unsigned int m = 12 * 12; - const unsigned int n = 64; - const unsigned int k = 384; - const auto t_lhs_shape = TensorShape(k, m); - const auto t_rhs_shape = TensorShape(n, k); - const auto t_dst_shape = TensorShape(n, m); - auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); - auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - auto t_bias_info = TensorInfo(TensorShape(), 1, data_type); - auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3 - auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); + const auto data_type = DataType::F32; + const auto m = 12U * 12U; + const auto n = 64U; + const auto k = 384U; + const auto t_lhs_shape = TensorShape(k, m); + const auto t_rhs_shape = TensorShape(n, k); + const auto t_dst_shape = TensorShape(n, m); + auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); + auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); + auto t_bias_info = TensorInfo(TensorShape(), 1, data_type); + auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3 + auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type); + auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); const auto common_kernel_desc = ClKernelComponentDescriptor{}; const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k }; @@ -242,7 +383,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) data_type, eltwise_add_desc.convert_policy); - CLScheduler::get().default_init(); + CLScheduler::get().default_reinit(); /* Condition 0: Dynamic Fused Kernel */ CLTensor cond0_t_dst{}; @@ -256,11 +397,11 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) ArgumentID tid_l1_addend; ArgumentID tid_dst; - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info, 2 }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info, 2 }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info, 2 }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info, 2 }; - const ClTensorDescriptor t_dst_desc{ &t_dst_info, 2 }; + const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; + const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; + const ClTensorDescriptor t_bias_desc{ &t_bias_info }; + const ClTensorDescriptor t_addend_desc{ &t_dst_info }; + const ClTensorDescriptor t_dst_desc{ &t_dst_info }; ClKernelCode cl_code; TICK(cond0_build_time) @@ -282,7 +423,7 @@ TEST_CASE(MoveNet_SubGraph_1, framework::DatasetMode::ALL) TOCK(cond0_build_time, measurements) TICK(cond0_tune_time) - ClExecutionDescriptor exec_desc; + ClExecutionDescriptor exec_desc{}; st = tune_static(exec_desc, cl_code); TOCK(cond0_tune_time, measurements) -- cgit v1.2.1