aboutsummaryrefslogtreecommitdiff
path: root/src/gpu/cl
diff options
context:
space:
mode:
author Jakub Sujak <jakub.sujak@arm.com> 2023-04-17 12:08:48 +0100
committer Omar Al Khatib <omar.alkhatib@arm.com> 2023-05-05 14:48:28 +0000
commit e9b3ee2badebf91188c1cd0e59d6aaa30ed60985 (patch)
tree 750c39df7c0113caf6a893bb6af6e9ef1ecc3756 /src/gpu/cl
parent edafe7f5fdc056fddc395c70420fc869dcb7d9fb (diff)
download ComputeLibrary-e9b3ee2badebf91188c1cd0e59d6aaa30ed60985.tar.gz
Connect CLMatMul function to quantized kernels and resolve NE BatchMatMul int_8 failures
* Adapt the CLMatMul function and ClMatMul operator to use quantized kernels.
* Add function-level tests.

Resolves: COMPMID-5929 and COMPMID-5811
Change-Id: I5348cdcf07b8074c138e04dfef0a73399377accd
Signed-off-by: Jakub Sujak <jakub.sujak@arm.com>
Signed-off-by: Omar Al Khatib <omar.alkhatib@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9575
Reviewed-by: Mohmun02 <MohammedSuhail.Munshi@arm.com>
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl')
-rw-r--r-- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h 6
-rw-r--r-- src/gpu/cl/kernels/ClMatMulNativeKernel.cpp 41
-rw-r--r-- src/gpu/cl/kernels/ClMatMulNativeKernel.h 6
-rw-r--r-- src/gpu/cl/operators/ClMatMul.cpp 62
-rw-r--r-- src/gpu/cl/operators/ClMatMul.h 38
5 files changed, 97 insertions, 56 deletions
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
index 13a33fbd62..d70ff30b91 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -48,17 +48,17 @@ public:
* Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
* @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
* Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] output Output tensor info. Data type supported: same as @p lhs
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
* @param[in] matmul_info Attributes for Batch MatMul Kernel
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulLowpNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
index 47dba22e8f..8f53c1998f 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -119,35 +119,36 @@ ClMatMulNativeKernel::ClMatMulNativeKernel()
{
_type = CLKernelType::GEMM;
}
-Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+
+Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info));
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output);
+ const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
}
return Status{};
}
-void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info);
- ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info));
- // output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+ // dst tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
- const int m = output->dimension(1);
- const int n = output->dimension(0);
+ const int m = dst->dimension(1);
+ const int n = dst->dimension(0);
const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
const bool adj_lhs = matmul_kernel_info.adj_lhs;
@@ -157,7 +158,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
_export_rhs_to_cl_image = matmul_kernel_info.export_rhs_to_cl_image && !rhs->lock_paddings();
// Configure kernel window
- Window win = calculate_max_window(*output, Steps(n0, m0));
+ Window win = calculate_max_window(*dst, Steps(n0, m0));
win = win.collapse(win, Window::DimZ);
IClKernel::configure_internal(win);
@@ -201,7 +202,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT
_config_id += "_";
_config_id += support::cpp11::to_string(k);
_config_id += "_";
- _config_id += support::cpp11::to_string(output->dimension(2));
+ _config_id += support::cpp11::to_string(dst->dimension(2));
_config_id += "_";
_config_id += support::cpp11::to_string(_export_rhs_to_cl_image);
_config_id += "_";
@@ -219,9 +220,9 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
- ICLTensor *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output);
+ ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst);
unsigned int idx = 0;
Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
@@ -242,7 +243,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl
}
add_3d_tensor_nhw_argument(idx, rhs);
- add_3d_tensor_nhw_argument(idx, output);
+ add_3d_tensor_nhw_argument(idx, dst);
enqueue(queue, *this, window_collapsed, lws_hint());
}
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
index 50aa3b70e4..f706256e31 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -47,17 +47,17 @@ public:
* Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
* @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs.
* Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
- * @param[out] output Output tensor info. Data type supported: same as @p lhs
+ * @param[out] dst Output tensor info. Data type supported: same as @p lhs
* @param[in] matmul_info Attributes for Batch MatMul Kernel
*/
- void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+ void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMulNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index 15833216bb..3822c16aa1 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -22,8 +22,11 @@
* SOFTWARE.
*/
#include "src/gpu/cl/operators/ClMatMul.h"
+
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
@@ -37,45 +40,74 @@ namespace arm_compute
namespace opencl
{
using namespace arm_compute::opencl::kernels;
+
ClMatMul::ClMatMul()
- : _native_matmul_kernel(std::make_unique<ClMatMulNativeKernel>())
+ : _matmul_native_kernel(std::make_unique<ClMatMulNativeKernel>()),
+ _matmul_lowp_native_kernel(std::make_unique<ClMatMulLowpNativeKernel>())
{
}
-ClMatMul::~ClMatMul()
-{
-}
-Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info)
+
+Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info)
{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+
const GPUTarget gpu_target = CLScheduler::get().target();
std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
- return ClMatMulNativeKernel::validate(lhs, rhs, output, kernel_info);
+ bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+
+ return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) :
+ ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info);
}
-void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulInfo &matmul_info)
+
+void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
- ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_info);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_info));
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info));
+
+ _is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+
const GPUTarget gpu_target = CLScheduler::get().target();
std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
- // Set the target for the kernels
- _native_matmul_kernel->set_target(gpu_target);
+ if(_is_quantized)
+ {
+ _matmul_lowp_native_kernel->set_target(gpu_target);
- // Configure the native matrix multiply kernel
- _native_matmul_kernel->configure(compile_context, lhs, rhs, output, kernel_info);
+ // Configure the low-precision native matrix multiply kernel
+ _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ }
+ else
+ {
+ _matmul_native_kernel->set_target(gpu_target);
+
+ // Configure the native matrix multiply kernel
+ _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info);
+ }
}
+
void ClMatMul::run(ITensorPack &tensors)
{
- CLScheduler::get().enqueue_op(*_native_matmul_kernel, tensors, true);
+ if(_is_quantized)
+ {
+ CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true);
+ }
+ else
+ {
+ CLScheduler::get().enqueue_op(*_matmul_native_kernel, tensors, true);
+ }
}
+
} // namespace opencl
} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index 20beda91ce..3d9863266e 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -21,11 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul
-#define ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul
+#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
+#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL
#include "src/gpu/cl/IClOperator.h"
#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+
#include <memory>
namespace arm_compute
@@ -41,17 +43,20 @@ class ClMatMul : public IClOperator
public:
/** Constructor */
ClMatMul();
- ~ClMatMul();
+ /** Default destructor */
+ ~ClMatMul() = default;
/** Initialise the kernel's inputs and output
*
* Valid data layouts:
* - All
*
* Valid data type configurations:
- * |lhs |rhs |output |
- * |:------------|:------------|:------------|
- * |F32 |F32 |F32 |
- * |F16 |F16 |F16 |
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
*
* @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B
* and stores the result in the dst tensor of the same batch size.
@@ -60,25 +65,28 @@ public:
* @note All tensors must have the same data type.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] lhs LHS input tensor info (Matrix A). Data types supported: F16/F32
- * @param[in] rhs RHS input tensor info (Matrix B). Data types supported: same as @p lhs.
- * @param[out] output Output tensor info. Data types supported: same as @p lhs
- * @param[in] matmul_info Attributes for MatMul
+ * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+ * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
*/
- void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulInfo &matmul_info);
+ void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref ClMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info);
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::ClMatMulNativeKernel> _native_matmul_kernel;
+ std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr};
+ std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
+
+ bool _is_quantized{ false };
};
} // namespace opencl
} // namespace arm_compute
-#endif // ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul
+#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */