From ea8d266515812c4dec936b2153ffd5335873e583 Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Thu, 20 May 2021 11:36:56 +0100 Subject: Enable unroll through pragma based on DDK version Change-Id: Id98a107d512369d3799961011a84e9cc4d99e775 Signed-off-by: Giorgio Arena Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5679 Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/core/CL/CLCompileContext.cpp | 15 +++++++++++++++ src/core/CL/cl_kernels/tile_helpers.h | 13 ++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index 3db0fe515a..bf3a866e4b 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -29,6 +29,8 @@ #include "arm_compute/core/Utils.h" #include "support/StringSupport.h" +#include <regex> + namespace arm_compute { CLBuildOptions::CLBuildOptions() @@ -263,6 +265,19 @@ std::string CLCompileContext::generate_build_options(const StringSet &build_opti ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); } + const GPUTarget arch = get_arch_from_target(_device.target()); + if(arch != GPUTarget::UNKNOWN && arch != GPUTarget::MIDGARD) + { + const std::string device_vers = _device.device_version(); + const std::regex ddk_regex("r([0-9]*)p[0-9]"); + std::smatch ddk_match; + + if(std::regex_search(device_vers, ddk_match, ddk_regex) && std::stoi(ddk_match[1]) >= 9) + { + concat_str += " -DUNROLL_WITH_PRAGMA "; + } + } + std::string build_options = stringify_set(build_options_set, kernel_path) + concat_str; return build_options; diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 4959c04448..f2d2f26cf2 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -70,6 +70,7 @@ #define TENSOR4D_STR(name, type) TENSOR4D_##type(name) #define TENSOR4D(name, type) TENSOR4D_STR(name, type) 
+#if !defined(UNROLL_WITH_PRAGMA) #define UNROLL_INCR(idx, step, macro) idx += (step); (macro) #define LOOP_UNROLLING_1(idx, step, macro) (macro) @@ -201,12 +202,22 @@ #define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro) #define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro) -#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro) #define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \ { \ type idx = start; \ LOOP_UNROLLING_##num(idx, step, macro); \ } +#else // !defined(UNROLL_WITH_PRAGMA) +#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \ + { \ + _Pragma("unroll") \ + for(type idx = start; idx < (num * step); idx += step) \ + { \ + (macro); \ + } \ + } +#endif // !defined(UNROLL_WITH_PRAGMA) +#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro) /** Get the get_global_id with partial N0. This function is useful when the dimension is not multiple of N0 and we need to use a partial N0 * to avoid out-of-bound read/write -- cgit v1.2.1