From 6c95c2dd574ebc3217c949a17016eb071935bc3b Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Mon, 20 Aug 2018 16:06:58 +0100 Subject: COMPMID-1188: Static tuning of CLScale Change-Id: Icf1cc00d9861fdb8766d0b8fd33ca90833863927 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/144830 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- arm_compute/core/CL/kernels/CLScaleKernel.h | 13 ++++++++++++ src/core/CL/kernels/CLScaleKernel.cpp | 15 ++++++++++++-- src/runtime/CL/functions/CLScale.cpp | 5 +++++ src/runtime/CL/tuners/BifrostTuner.cpp | 32 +++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h index b089e8f61a..db87519c0f 100644 --- a/arm_compute/core/CL/kernels/CLScaleKernel.h +++ b/arm_compute/core/CL/kernels/CLScaleKernel.h @@ -58,10 +58,23 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy = SamplingPolicy::CENTER); + /** Input tensor accessor. + * + * @return Pointer to input tensor. + */ + const ICLTensor *input() const; + /** Output tensor accessor. + * + * @return Pointer to output tensor. 
+ */ + const ICLTensor *output() const; // Inherited methods overridden: BorderSize border_size() const override; void run(const Window &window, cl::CommandQueue &queue) override; + +public: + InterpolationPolicy _interpolationPolicy = InterpolationPolicy::BILINEAR; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLSCALEKERNEL_H__ */ diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp index b2cd4b7adf..d56d6f7da8 100644 --- a/src/core/CL/kernels/CLScaleKernel.cpp +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -148,10 +148,21 @@ Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *outp return Status{}; } +const ICLTensor *CLScaleKernel::input() const +{ + return _input; +} + +const ICLTensor *CLScaleKernel::output() const +{ + return _output; +} + void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy) { - _input = input; - _output = output; + _input = input; + _output = output; + _interpolationPolicy = policy; ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy)); diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp index 7ef55f9f08..4ff9763397 100644 --- a/src/runtime/CL/functions/CLScale.cpp +++ b/src/runtime/CL/functions/CLScale.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/kernels/CLScaleKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" #include "support/ToolchainSupport.h" using namespace arm_compute; @@ -34,9 +35,13 @@ using namespace arm_compute; void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy) { auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>(); + k->set_target(CLScheduler::get().target()); 
k->configure(input, output, policy, border_mode, sampling_policy); _kernel = std::move(k); + // Tune kernels + CLScheduler::get().tune_kernel_static(*_kernel); + // In the case of NHWC we can't have undefined border mode as this would require to access elements outside z dimension, // so we treat it like border constant. if(border_mode == BorderMode::UNDEFINED && input->info()->data_layout() == DataLayout::NHWC) diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp index fa67710cc8..2d52f3392e 100644 --- a/src/runtime/CL/tuners/BifrostTuner.cpp +++ b/src/runtime/CL/tuners/BifrostTuner.cpp @@ -249,6 +249,34 @@ void tune_pooling_kernel(CLPoolingLayerKernel &k) k.set_lws_hint(lws_hint); } + +void tune_scale_kernel(CLScaleKernel &k) +{ + cl::NDRange lws_hint = k.lws_hint(); + const GPUTarget gpu_target = k.get_target(); + const DataType dt = k.input()->info()->data_type(); + const InterpolationPolicy interpolation = k._interpolationPolicy; + + // Configure the local work size for Bifrost, interpolation (bilinear) and datatype F32. + // The value are obtained via exhaustive autotuning. + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (dt == DataType::F32) && (interpolation == InterpolationPolicy::BILINEAR)) + { + auto dim_0 = k.output()->info()->dimension(0); + if(dim_0 == 480) + { + lws_hint = cl::NDRange(2, 1); + } + else if(dim_0 == 3120) + { + lws_hint = cl::NDRange(2, 8); + } + else if(dim_0 == 4160) + { + lws_hint = cl::NDRange(4, 8); + } + k.set_lws_hint(lws_hint); + } +} } // namespace void BifrostTuner::tune_kernel_static(ICLKernel &kernel) @@ -281,6 +309,10 @@ void BifrostTuner::tune_kernel_static(ICLKernel &kernel) { tune_pooling_kernel(*utils::cast::polymorphic_downcast<CLPoolingLayerKernel *>(&kernel)); } + else if(dynamic_cast<CLScaleKernel *>(&kernel) != nullptr) + { + tune_scale_kernel(*utils::cast::polymorphic_downcast<CLScaleKernel *>(&kernel)); + } } void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel) -- cgit v1.2.1