diff options
author | SiCong Li <sicong.li@arm.com> | 2022-01-28 18:24:39 +0000 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2022-05-06 15:01:45 +0000 |
commit | b63b1196adea8b07dd8db77c2492a212650deba0 (patch) | |
tree | b264035197873f56c69784bec68cad7041b5d423 /src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp | |
parent | 3bb72b69566f18ad5c9446d318d2fc2b5f6dba42 (diff) | |
download | ComputeLibrary-b63b1196adea8b07dd8db77c2492a212650deba0.tar.gz |
Integrate Dynamic Fusion patches
* Add public interfaces:
* OperatorGraph: Describe a workload that could contain fused kernels
* IWorkload: Generic interface for workloads built from OperatorGraph
* ClWorkload: OpenCL workloads built from OperatorGraph
* ClCompositeOperator: Runtime async operator to execute a ClWorkload
* DependencyGraph (will likely be deprecated in later iterations)
* Add example
* cl_fused_conv2d_elementwise_add.cpp to explain how to use the new
interfaces
* Add internal translation layer
* Refactor ClKernelBuildingAPI
* Remove non-tile based gemm native kernel component
* Minor interface changes
* Add integration tests
Resolves COMPMID-5161
Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp')
-rw-r--r-- | src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp | 202 |
1 files changed, 107 insertions, 95 deletions
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp index f951ce3d46..11fb1d53d0 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" @@ -31,6 +33,7 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" namespace arm_compute { namespace experimental @@ -44,7 +47,7 @@ ComponentType ClDirectConvolutionKernelComponent::get_component_type() const std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const { - return std::set<std::string> { "helpers.h", "tile_helpers.h", "repeat.h" }; + return std::set<std::string> { "helpers.h", "tile_helpers.h" }; } Window ClDirectConvolutionKernelComponent::get_window() const @@ -54,7 +57,17 @@ Window ClDirectConvolutionKernelComponent::get_window() const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info); + PadStrideInfo pad_stride_info + { + static_cast<unsigned int>(_desc.conv2d.stride.x()), + static_cast<unsigned int>(_desc.conv2d.stride.y()), + static_cast<unsigned int>(_desc.conv2d.pad.left), + static_cast<unsigned int>(_desc.conv2d.pad.right), + static_cast<unsigned int>(_desc.conv2d.pad.top), + static_cast<unsigned int>(_desc.conv2d.pad.bottom), + DimensionRoundingType::FLOOR /*default rounding type*/ + }; + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst_info, output_shape, @@ -64,6 +77,9 @@ Window ClDirectConvolutionKernelComponent::get_window() const const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u); const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U; + // const unsigned int num_rows = 1; + // const unsigned int vec_size = tile_info.tile_dims.x(); + // const unsigned int num_rows = tile_info.tile_dims.y(); // Create and configure kernel window Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); @@ -95,27 +111,30 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const //------------------ START KERNEL {{meta_kernel_id}} --------------------- // IN_0(src) {{src}} // IN_1(wei) {{weight}} + )_"; + if(bias_info != nullptr) + { + code += R"_( // IN_1(bia) {{bias}} + )_"; + } + code += R"_( // OUT(dst, accum) {{dst}} - const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM - const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - // Initialize the accumulators TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); { // All the tensor dimensions are passed at compile time. // In case of dynamic tensor support, the following dimensions should be passed as function argument. - #define _I{{WEI_WIDTH}} {{WEI_WIDTH}} - #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}} + #define _IWEI_WIDTH {{WEI_WIDTH}} + #define _IWEI_HEIGHT {{WEI_HEIGHT}} #define _ISRC_WIDTH {{src}}_w #define _ISRC_HEIGHT {{src}}_h #define _ISRC_CHANNELS {{src}}_c - #define _IDST_WIDTH {{dst_w}} - #define _IDST_HEIGHT {{dst_h}} - #define _IDST_CHANNELS {{dst_c}} - #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}) + #define _IDST_WIDTH {{arg_dst}}_w + #define _IDST_HEIGHT {{arg_dst}}_h + #define _IDST_CHANNELS {{arg_dst}}_c + #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // .v = access the whole vector (OpenCL vector) // .s[x] = access the vector element at position x (scalar access) @@ -136,13 +155,11 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const {{dst}}[i].v = 0; }) - uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0); - - for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i) + for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { int ck = 0; - int xk = i % _I{{WEI_WIDTH}}; - int yk = i / _I{{WEI_WIDTH}}; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_HEIGHT; int k = 0; for(; k <= (_ISRC_CHANNELS - K0); k += K0) @@ -201,6 +218,16 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const } code += R"_( + #undef _I_WEI_WIDTH + #undef _I_WEI_HEIGHT + #undef _ISRC_WIDTH + #undef _ISRC_HEIGHT + #undef _ISRC_CHANNELS + #undef _IDST_WIDTH + #undef _IDST_HEIGHT + #undef _IDST_CHANNELS + #undef _IY_MULTIPLIER + } )_"; @@ -217,44 +244,7 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const } code += R"_( - #undef _I{{WEI_WIDTH}} - #undef _I{{WEI_HEIGHT}} - #undef _ISRC_WIDTH - #undef _ISRC_HEIGHT - #undef _ISRC_CHANNELS - #undef _IDST_WIDTH - #undef _IDST_HEIGHT - #undef _IDST_CHANNELS - #undef _IY_MULTIPLIER } - - // Workaround for the discrepancy between tiles and repeats - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v; -#if M0 >= 2 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v; -#endif // M0 >= 2 -#if M0 >= 3 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v; -#endif // M0 >= 3 -#if M0 >= 4 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v; -#endif // M0 >= 4 -#if M0 >= 8 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v; -#endif // M0 >= 8 -#if M0 == 16 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v; -#endif // M0 == 16 //------------------ END KERNEL {{meta_kernel_id}} --------------------- )_"; return code.c_str(); @@ -306,19 +296,18 @@ bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const { const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + // const auto tile_info = _blueprint->impl().get_tile_info(); const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); const DataType data_type = src_info->data_type(); - const GPUTarget gpu_target = ICLKernel().get_target(); - - Window win = get_window(); + const GPUTarget gpu_target = CLScheduler::get().target(); - const unsigned int n0 = win.x().step(); - const unsigned int m0 = win.y().step(); + const unsigned int n0 = _blueprint->impl().get_execution_window().x().step(); + const unsigned int m0 = _blueprint->impl().get_execution_window().y().step(); const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst_info->dimension(channel_idx) % n0; + const unsigned int partial_store_n0 = dst_info->dimension(0) % n0; const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); // Update the padding for the weights tensor if we can export to cl_image @@ -338,54 +327,79 @@ CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() cons return build_opts; } -ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const +{ + const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); + const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + + vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); + + const GPUTarget gpu_target = CLScheduler::get().target(); + const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); + const ClKernelTensorArgType weight_type = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer; + vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight"); + + if(!_bias.is_empty()) // optional bias + { + vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias"); + } + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); +} + +ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const { TagLUT lut{}; const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); - const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - const GPUTarget gpu_target = ICLKernel().get_target(); - const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); - const TensorArgType weight_type = export_to_cl_image ? TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer; - lut["meta_kernel_id"] = id(); - lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src"); - lut["weight"] = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight"); + // Arguments and global shared variables + lut["src"] = vtable.get(_src); + lut["weight"] = vtable.get(_weight); if(!_bias.is_empty()) // optional bias { - lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias"); + lut["bias"] = vtable.get(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type()); } - lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst"); - - // Local build options - const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); - const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); - const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); + lut["dst"] = vtable.get(_dst); - lut["dst_w"] = dst_info->dimension(width_idx); - lut["dst_h"] = dst_info->dimension(height_idx); - lut["dst_c"] = dst_info->dimension(channel_idx); + const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var(); + lut["arg_dst"] = dst_argument.uniq_name; - lut["ACC_DATA_TYPE"] = src_info->data_type(); - lut["SRC_DATA_TYPE"] = src_info->data_type(); - lut["WEI_DATA_TYPE"] = weight_info->data_type(); + // Local build options + lut["meta_kernel_id"] = id(); + lut["ACC_DATA_TYPE"] = src_info->data_type(); + lut["SRC_DATA_TYPE"] = src_info->data_type(); + lut["WEI_DATA_TYPE"] = weight_info->data_type(); lut["SRC_TENSOR_TYPE"] = "BUFFER"; - lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? "IMAGE" : "BUFFER"; - - lut["WEI_WIDTH"] = weight_info->dimension(width_idx); - lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); + switch(vtable.get(_weight).desc.tensor_arg_type) + { + case ClKernelTensorArgType::Image_Export_To_ClImage2D: + case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: + case ClKernelTensorArgType::Tensor_4D_t_Image: + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } + default: + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } + } + const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); + const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); + lut["WEI_WIDTH"] = weight_info->dimension(width_idx); + lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); - lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride()); - lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride()); + lut["STRIDE_X"] = _desc.conv2d.stride.x(); + lut["STRIDE_Y"] = _desc.conv2d.stride.y(); - lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left(); - lut["PAD_TOP"] = _desc.pad_stride_info.pad_top(); + lut["PAD_LEFT"] = _desc.conv2d.pad.left; + lut["PAD_TOP"] = _desc.conv2d.pad.top; lut["ZERO_VALUE"] = 0; @@ -393,6 +407,4 @@ ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::a } } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +} // namespace arm_compute
\ No newline at end of file |