From ba8690b4f0f82dfb66e3da819361e4032e9fa4db Mon Sep 17 00:00:00 2001 From: ramelg01 Date: Fri, 4 Feb 2022 11:37:00 +0000 Subject: Improve start-up time for winograd_filter_transform_*_nhwc - pass tensor's dimensions at runtime rather than compile time - Add guard macro to compile only kernel of internest Resolves: COMPMID-5118 Signed-off-by: Ramy Elgammal Change-Id: Ie42c3c07fdd817ce62e7cad354381bc22c6e9264 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7058 Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins --- .../cl_kernels/nhwc/winograd_filter_transform.cl | 90 +++++++++++++++------- .../cl/kernels/ClWinogradFilterTransformKernel.cpp | 24 +++++- .../cl/kernels/ClWinogradFilterTransformKernel.h | 5 +- 3 files changed, 85 insertions(+), 34 deletions(-) diff --git a/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl index 8d5fd3437f..45fbc1b641 100644 --- a/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl +++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,8 +23,6 @@ */ #include "helpers.h" -#if defined(SRC_DIM_Z) - #define OUTPUT_ROW_2x2_7x7(out, tmp) \ ({ \ out.s0 = -tmp.s0 / 36.f; \ @@ -37,9 +35,9 @@ out.s7 = tmp.s6; \ }) +#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC) /** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. @@ -62,10 +60,12 @@ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_4x4_3x3_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); @@ -194,10 +194,11 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc( *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55; #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC) +#if defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC) /** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. @@ -220,10 +221,12 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_4x4_5x5_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); @@ -473,9 +476,12 @@ __kernel void winograd_filter_transform_4x4_5x5_nhwc( *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC) + +#if defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC) + /** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. @@ -498,10 +504,12 @@ __kernel void winograd_filter_transform_4x4_5x5_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_2x2_7x7_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); @@ -773,13 +781,13 @@ __kernel void winograd_filter_transform_2x2_7x7_nhwc( *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) } -#endif // defined(SRC_DIM_Z) +#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC) #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) +#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) /** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -801,10 +809,12 @@ __kernel void winograd_filter_transform_2x2_7x7_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_4x1_3x1_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { winograd_filter_transform_4x4_3x3_nhwc(src_ptr, src_stride_x, @@ -823,12 +833,14 @@ __kernel void winograd_filter_transform_4x1_3x1_nhwc( dst_step_y, dst_stride_z, dst_step_z, - dst_offset_first_element_in_bytes); + dst_offset_first_element_in_bytes, + SRC_DIM_Z); } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_3X1_NHWC) +#if defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) /** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -850,10 +862,12 @@ __kernel void winograd_filter_transform_4x1_3x1_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_4x1_5x1_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { winograd_filter_transform_4x4_5x5_nhwc(src_ptr, src_stride_x, @@ -872,12 +886,14 @@ __kernel void winograd_filter_transform_4x1_5x1_nhwc( dst_step_y, dst_stride_z, dst_step_z, - dst_offset_first_element_in_bytes); + dst_offset_first_element_in_bytes, + SRC_DIM_Z); } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_4X1_5X1_NHWC) +#if defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) /** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float. * @@ -899,10 +915,12 @@ __kernel void winograd_filter_transform_4x1_5x1_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_2x1_7x1_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { winograd_filter_transform_2x2_7x7_nhwc(src_ptr, src_stride_x, @@ -921,14 +939,16 @@ __kernel void winograd_filter_transform_2x1_7x1_nhwc( dst_step_y, dst_stride_z, dst_step_z, - dst_offset_first_element_in_bytes); + dst_offset_first_element_in_bytes, + SRC_DIM_Z); } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_2X1_7X1_NHWC) #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC) /** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -950,10 +970,12 @@ __kernel void winograd_filter_transform_2x1_7x1_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_1x4_1x3_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { winograd_filter_transform_4x4_3x3_nhwc(src_ptr, src_stride_x, @@ -972,12 +994,14 @@ __kernel void winograd_filter_transform_1x4_1x3_nhwc( dst_step_y, dst_stride_z, dst_step_z, - dst_offset_first_element_in_bytes); + dst_offset_first_element_in_bytes, + SRC_DIM_Z); } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X3_NHWC) +#if defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC) /** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. * @@ -999,10 +1023,12 @@ __kernel void winograd_filter_transform_1x4_1x3_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_1x4_1x5_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { winograd_filter_transform_4x4_5x5_nhwc(src_ptr, src_stride_x, @@ -1021,12 +1047,14 @@ __kernel void winograd_filter_transform_1x4_1x5_nhwc( dst_step_y, dst_stride_z, dst_step_z, - dst_offset_first_element_in_bytes); + dst_offset_first_element_in_bytes, + SRC_DIM_Z); } +#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X4_1X5_NHWC) +#if defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC) /** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2 * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float. * @@ -1048,10 +1076,12 @@ __kernel void winograd_filter_transform_1x4_1x5_nhwc( * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] SRC_DIM_Z The third (Z) dimension of the src tensor */ __kernel void winograd_filter_transform_1x2_1x7_nhwc( TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) + TENSOR3D_DECLARATION(dst), + const int SRC_DIM_Z) { winograd_filter_transform_2x2_7x7_nhwc(src_ptr, src_stride_x, @@ -1070,6 +1100,8 @@ __kernel void winograd_filter_transform_1x2_1x7_nhwc( dst_step_y, dst_stride_z, dst_step_z, - dst_offset_first_element_in_bytes); + dst_offset_first_element_in_bytes, + SRC_DIM_Z); } -#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) \ No newline at end of file +#endif // defined(WINOGRAD_FILTER_TRANSFORM_1X2_1X7_NHWC) +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp index 4ba6ba8a9a..136376a39f 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -108,7 +108,16 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ // Set build options CLBuildOptions build_opts; - build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2))); + + // For NHWC layouts pass tensor dimesions at runtime + if(src->data_layout() == DataLayout::NHWC) + { + _src_dim_z = src->dimension(2); + } + else + { + build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2))); + } build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL"); build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL"); @@ -117,7 +126,10 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ // Create kernel std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // A macro guard to compile ONLY the kernel of interest + build_opts.add_option("-D" + upper_string(kernel_name)); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window auto win_config = validate_and_configure_window(src, dst); @@ -149,8 +161,12 @@ void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); add_3D_tensor_argument(idx, dst, window_out); + if(src->info()->data_layout() == DataLayout::NHWC) + { + _kernel.setArg(idx++, _src_dim_z); + } enqueue(queue, *this, window, lws_hint()); } } // namespace kernels } // namespace opencl -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h index fe0c3da174..b2130304e6 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -70,6 +70,9 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + int32_t _src_dim_z{ 0 }; }; } // namespace kernels } // namespace opencl -- cgit v1.2.1