From b0c5037d94ba7073ccabb0ebaff54db320f184c4 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 15 Mar 2019 10:13:05 +0000 Subject: COMPMID-2043: Add support for "dummy threads" in CLGEMMReshaped Change-Id: I89403b97503fbb99f6a32f5d62b8c535ab26a7be Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/877 Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- arm_compute/core/CL/CLHelpers.h | 11 ++++++++- arm_compute/core/CL/ICLKernel.h | 14 ++++++----- .../CLGEMMLowpMatrixMultiplyReshapedKernel.h | 1 + .../kernels/CLGEMMMatrixMultiplyReshapedKernel.h | 1 + .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h | 1 + arm_compute/core/Helpers.h | 28 ++++++++++++++++++++++ 6 files changed, 49 insertions(+), 7 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index 78427c3738..16fe09fb96 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -144,5 +144,14 @@ bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Si * @return preferred vector width */ size_t preferred_vector_width(const cl::Device &device, DataType dt); + +/** Helper function to check whether "dummy work-items" are preferred in order to have a power-of-two NDRange + * If dummy work-items are enabled, it is the OpenCL kernel's responsibility to check whether the work-item is out-of-range + * + * @param[in] device A CL device + * + * @return True if dummy work-items should be preferred to dispatch the NDRange + */ +bool preferred_dummy_work_items_support(const cl::Device &device); } #endif /* __ARM_COMPUTE_CLHELPERS_H__ */ diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h index f5423584e1..590f8929cb 100644 --- a/arm_compute/core/CL/ICLKernel.h +++ b/arm_compute/core/CL/ICLKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018 ARM Limited. + * Copyright (c) 2016-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -308,14 +308,16 @@ private: * * @note If kernel->kernel() is empty then the function will return without adding anything to the queue. * - * @param[in,out] queue OpenCL command queue. - * @param[in] kernel Kernel to enqueue - * @param[in] window Window the kernel has to process. - * @param[in] lws_hint Local workgroup size requested. Default is based on the device target. + * @param[in,out] queue OpenCL command queue. + * @param[in] kernel Kernel to enqueue + * @param[in] window Window the kernel has to process. + * @param[in] lws_hint (Optional) Local workgroup size requested. Default is based on the device target. + * @param[in] use_dummy_work_items (Optional) Use dummy work-items in order to have a two-dimensional power-of-two NDRange. Default is false + * Note: it is the kernel's responsibility to check whether the work-item is out-of-range * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. 
*/ -void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange()); +void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false); /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. * diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h index 1cf7236446..d361236380 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h @@ -81,6 +81,7 @@ private: bool _slide_matrix_b; bool _reinterpret_output_as_3d; unsigned int _k; + bool _use_dummy_work_items; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H__*/ \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h index cb23b969dd..b0d245f342 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h @@ -84,6 +84,7 @@ private: bool _slide_matrix_b; bool _reinterpret_output_as_3d; unsigned int _k; + bool _use_dummy_work_items; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H__*/ \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h index 74715949f2..b3ee43555a 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h @@ -84,6 +84,7 @@ private: bool 
_slide_matrix_b; bool _reinterpret_input_as_3d; bool _reinterpret_output_as_3d; + bool _use_dummy_work_items; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H__*/ \ No newline at end of file diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index 91d85be086..c7c7110ef5 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -756,6 +756,34 @@ inline T wrap_around(T x, T m) { return x >= 0 ? x % m : (x % m + m) % m; } + +/** Given an integer value, this function returns the smallest power of two that is greater than or equal to it + * + * @param[in] x Input value + * + * @return The smallest power of two greater than or equal to @p x (x itself if it is already a power of two). Returns 0 if @p x is 0. NOTE(review): assuming a 32-bit unsigned int, it also wraps to 0 if @p x is greater than 2^31 - confirm callers never pass such values + */ +inline unsigned int get_next_power_two(unsigned int x) +{ + // Decrement by 1 so that an exact power of two maps to itself + x--; + + // Smear the highest set bit downwards: OR in x shifted right by 1 + x |= x >> 1u; + // OR in x shifted right by 2 + x |= x >> 2u; + // OR in x shifted right by 4 + x |= x >> 4u; + // OR in x shifted right by 8 + x |= x >> 8u; + // OR in x shifted right by 16 (every bit below the highest set bit is now set, for values up to 32 bits wide) + x |= x >> 16u; + + // Increment by 1 to carry into the next power of two + x++; + + return x; +} } // namespace arm_compute #include "arm_compute/core/Helpers.inl" -- cgit v1.2.1