From e9b3ee2badebf91188c1cd0e59d6aaa30ed60985 Mon Sep 17 00:00:00 2001 From: Jakub Sujak Date: Mon, 17 Apr 2023 12:08:48 +0100 Subject: Connect CLMatMul function to quantized kernels and resolve NE BatchMatMul int_8 failures * Adapt the CLMatMul function and ClMatMul operator to use quantized kernels. * Add function-level tests. Resolves: COMPMID-5929 and COMPMID-5811 Change-Id: I5348cdcf07b8074c138e04dfef0a73399377accd Signed-off-by: Jakub Sujak Signed-off-by: Omar Al Khatib Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9575 Reviewed-by: Mohmun02 Reviewed-by: SiCong Li Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- arm_compute/runtime/CL/functions/CLMatMul.h | 29 +++--- arm_compute/runtime/NEON/functions/NEMatMul.h | 22 ++-- src/cpu/operators/CpuMatMul.cpp | 1 + src/cpu/operators/CpuMatMul.h | 12 +-- src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h | 6 +- src/gpu/cl/kernels/ClMatMulNativeKernel.cpp | 41 ++++---- src/gpu/cl/kernels/ClMatMulNativeKernel.h | 6 +- src/gpu/cl/operators/ClMatMul.cpp | 62 ++++++++--- src/gpu/cl/operators/ClMatMul.h | 38 ++++--- tests/validation/CL/MatMul.cpp | 143 +++++++++++++++++++++----- tests/validation/fixtures/MatMulFixture.h | 9 +- 11 files changed, 255 insertions(+), 114 deletions(-) diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h index 712bac06bf..2af9a4a9a6 100644 --- a/arm_compute/runtime/CL/functions/CLMatMul.h +++ b/arm_compute/runtime/CL/functions/CLMatMul.h @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL -#define ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL +#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL #include "arm_compute/runtime/IFunction.h" #include + namespace arm_compute { // Forward declarations for used types instead of including their header, that could minimize compile time @@ -64,10 +65,12 @@ public: * - All * * Valid data type configurations: - * |lhs |rhs |output | - * |:------------|:------------|:--------------| - * |F32 |F32 |F32 | - * |F16 |F16 |F16 | + * |lhs |rhs |dst | + * |:--------------|:--------------|:--------------| + * |F32 |F32 |F32 | + * |F16 |F16 |F16 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QASYMM8 |QASYMM8 |QASYMM8 | * * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B * and stores the result in the dst tensor of the same batch size. @@ -76,18 +79,18 @@ public: * @note All tensors must have the same data type. * * @param[in] compile_context The compile context to be used. - * @param[in] lhs LHS input tensor (Matrix or Vector A). Data types supported: F16/F32 - * @param[in] rhs RHS input tensor (Matrix B). Data type supported: same as @p lhs. - * @param[out] output Output tensor. Data type supported: same as @p lhs. - * @param[in] matmul_info Attributes for MatMul + * @param[in] lhs Left-hand side tensor info containing the input activations as Matrix A. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info containing the input weights as Matrix B. Data types supported: same as @p lhs. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. + * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. 
* @param[in] settings Class containing flags for function level settings */ - void configure(const CLCompileContext &compile_context, ICLTensor *rhs, ICLTensor *lhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); + void configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); /** Initialise the kernel's inputs and output * * Similar to @ref CLMatMul::configure() */ - void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); + void configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *dst, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings = GpuMatMulSettings{}); /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul. * * Similar to @ref CLMatMul::configure() @@ -104,4 +107,4 @@ private: }; } // namespace arm_compute -#endif /* ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */ +#endif /* ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */ diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h index 0f3e3adacc..a331c55a98 100644 --- a/arm_compute/runtime/NEON/functions/NEMatMul.h +++ b/arm_compute/runtime/NEON/functions/NEMatMul.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL -#define ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL #include "arm_compute/runtime/IFunction.h" #include @@ -80,25 +80,27 @@ public: * - Any * * Valid data type configurations: - * |src0 |src1 |dst | + * |lhs |rhs |dst | * |:--------------|:------------------|:--------------| * |F32 |F32 |F32 | * |F16 |F16 |F16 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QASYMM8 |QASYMM8 |QASYMM8 | * - * @param[in] lhs Input source tensor. - * @param[in] rhs Input source tensor. - * @param[out] output Output tensor. Data type supported: same as @p lhs/rhs - * @param[in] info Class containing flags to transpose lhs/rhs + * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. + * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. 
* @param[in] settings Class containing flags for function level settings i.e fast math */ - void configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings); + void configure(ITensor *lhs, ITensor *rhs, ITensor *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul * * Parameters are similar to @ref NEMatMul::configure() * * @return Status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings); // Inherited methods overridden void run() override; @@ -108,4 +110,4 @@ private: std::unique_ptr _impl; }; } -#endif /* ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL */ +#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL */ diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index 64b5167ad0..87cb6c6b54 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -191,6 +191,7 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, // Fill AsmGemmInfo class object before configuration _gemm_info.activation_info = info.fused_activation(); _gemm_info.fast_mode = settings.fast_math(); + _gemm_info.negated_offsets = false; lhs_to_use = (_adj_lhs) ? _lhs_transposed : lhs_to_use; rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use; diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h index ae6345141e..9f5833b24f 100644 --- a/src/cpu/operators/CpuMatMul.h +++ b/src/cpu/operators/CpuMatMul.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef SRC_CPU_OPERATORS_CPUMATMUL -#define SRC_CPU_OPERATORS_CPUMATMUL +#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL +#define ACL_SRC_CPU_OPERATORS_CPUMATMUL #include "arm_compute/core/TensorInfo.h" #include "src/core/common/Macros.h" @@ -59,9 +59,9 @@ public: * Note: Check documentation of @ref NEMatMul for a list of supported datatypes and layouts * * - * @param[in] lhs Source tensor info. - * @param[in] rhs Source tensor info. - * @param[out] dst Destination tensor info. Data types supported: same as @p lhs / @p rhs. + * @param[in] lhs Left-hand side tensor info. + * @param[in] rhs Right-hand side tensor info. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] settings The settings for matmul operation (i.e fast math) */ @@ -112,4 +112,4 @@ private: } } -#endif /* SRC_CPU_OPERATORS_CPUMATMUL */ +#endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */ diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h index 13a33fbd62..d70ff30b91 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h @@ -48,17 +48,17 @@ public: * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[out] output Output tensor info. Data type supported: same as @p lhs + * @param[out] dst Output tensor info. 
Data type supported: same as @p lhs * @param[in] matmul_info Attributes for Batch MatMul Kernel */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulLowpNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp index 47dba22e8f..8f53c1998f 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -119,35 +119,36 @@ ClMatMulNativeKernel::ClMatMulNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) + +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); 
ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info)); - if(output->total_size() != 0) + if(dst->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output); + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } return Status{}; } -void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info); - ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_kernel_info); + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_kernel_info)); - // output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + // dst tensor auto 
initialization if not yet initialized + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); - const int m = output->dimension(1); - const int n = output->dimension(0); + const int m = dst->dimension(1); + const int n = dst->dimension(0); const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); const bool adj_lhs = matmul_kernel_info.adj_lhs; @@ -157,7 +158,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT _export_rhs_to_cl_image = matmul_kernel_info.export_rhs_to_cl_image && !rhs->lock_paddings(); // Configure kernel window - Window win = calculate_max_window(*output, Steps(n0, m0)); + Window win = calculate_max_window(*dst, Steps(n0, m0)); win = win.collapse(win, Window::DimZ); IClKernel::configure_internal(win); @@ -201,7 +202,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT _config_id += "_"; _config_id += support::cpp11::to_string(k); _config_id += "_"; - _config_id += support::cpp11::to_string(output->dimension(2)); + _config_id += support::cpp11::to_string(dst->dimension(2)); _config_id += "_"; _config_id += support::cpp11::to_string(_export_rhs_to_cl_image); _config_id += "_"; @@ -219,9 +220,9 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl const ICLTensor *lhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); const ICLTensor *rhs = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - ICLTensor *output = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output); + ICLTensor *dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + 
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); unsigned int idx = 0; Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ); @@ -242,7 +243,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl } add_3d_tensor_nhw_argument(idx, rhs); - add_3d_tensor_nhw_argument(idx, output); + add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, window_collapsed, lws_hint()); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h index 50aa3b70e4..f706256e31 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -47,17 +47,17 @@ public: * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[out] output Output tensor info. Data type supported: same as @p lhs + * @param[out] dst Output tensor info. 
Data type supported: same as @p lhs * @param[in] matmul_info Attributes for Batch MatMul Kernel */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 15833216bb..3822c16aa1 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -22,8 +22,11 @@ * SOFTWARE. 
*/ #include "src/gpu/cl/operators/ClMatMul.h" + #include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" @@ -37,45 +40,74 @@ namespace arm_compute namespace opencl { using namespace arm_compute::opencl::kernels; + ClMatMul::ClMatMul() - : _native_matmul_kernel(std::make_unique()) + : _matmul_native_kernel(std::make_unique()), + _matmul_lowp_native_kernel(std::make_unique()) { } -ClMatMul::~ClMatMul() -{ -} -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info) + +Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info) { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + const GPUTarget gpu_target = CLScheduler::get().target(); std::unique_ptr t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - return ClMatMulNativeKernel::validate(lhs, rhs, output, kernel_info); + bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); + + return is_quantized ? 
ClMatMulLowpNativeKernel::validate(lhs, rhs, dst, kernel_info) : + ClMatMulNativeKernel::validate(lhs, rhs, dst, kernel_info); } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulInfo &matmul_info) + +void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_info); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info)); + + _is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); + const GPUTarget gpu_target = CLScheduler::get().target(); std::unique_ptr t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - // Set the target for the kernels - _native_matmul_kernel->set_target(gpu_target); + if(_is_quantized) + { + _matmul_lowp_native_kernel->set_target(gpu_target); - // Configure the native matrix multiply kernel - _native_matmul_kernel->configure(compile_context, lhs, rhs, output, kernel_info); + // Configure the low-precision native matrix multiply kernel + _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info); + } + else + { + _matmul_native_kernel->set_target(gpu_target); + + // Configure the native matrix multiply kernel + _matmul_native_kernel->configure(compile_context, lhs, rhs, dst, kernel_info); + } } + void ClMatMul::run(ITensorPack &tensors) { - CLScheduler::get().enqueue_op(*_native_matmul_kernel, tensors, true); + if(_is_quantized) + { + CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true); + } + else + { + 
CLScheduler::get().enqueue_op(*_matmul_native_kernel, tensors, true); + } } + } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index 20beda91ce..3d9863266e 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -21,11 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul -#define ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul +#ifndef ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL +#define ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" + #include namespace arm_compute @@ -41,17 +43,20 @@ class ClMatMul : public IClOperator public: /** Constructor */ ClMatMul(); - ~ClMatMul(); + /** Default destructor */ + ~ClMatMul() = default; /** Initialise the kernel's inputs and output * * Valid data layouts: * - All * * Valid data type configurations: - * |lhs |rhs |output | - * |:------------|:------------|:------------| - * |F32 |F32 |F32 | - * |F16 |F16 |F16 | + * |lhs |rhs |dst | + * |:--------------|:--------------|:--------------| + * |F32 |F32 |F32 | + * |F16 |F16 |F16 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QASYMM8 |QASYMM8 |QASYMM8 | * * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B * and stores the result in the dst tensor of the same batch size. @@ -60,25 +65,28 @@ public: * @note All tensors must have the same data type. * * @param[in] compile_context The compile context to be used. - * @param[in] lhs LHS input tensor info (Matrix A). Data types supported: F16/F32 - * @param[in] rhs RHS input tensor info (Matrix B). Data types supported: same as @p lhs. - * @param[out] output Output tensor info. 
Data types supported: same as @p lhs - * @param[in] matmul_info Attributes for MatMul + * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. + * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulInfo &matmul_info); + void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr _native_matmul_kernel; + std::unique_ptr _matmul_native_kernel{nullptr}; + std::unique_ptr _matmul_lowp_native_kernel{nullptr}; + + bool _is_quantized{ false }; }; } // namespace opencl } // namespace arm_compute -#endif // ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul +#endif /* ACL_ARM_COMPUTE_SRC_GPU_CL_OPERATORS_CLMATMUL */ diff --git a/tests/validation/CL/MatMul.cpp b/tests/validation/CL/MatMul.cpp index 7c1d16008f..6364b16200 100644 --- a/tests/validation/CL/MatMul.cpp +++ b/tests/validation/CL/MatMul.cpp @@ -21,14 +21,19 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLMatMul.h" + #include "tests/CL/CLAccessor.h" -#include "tests/datasets/LargeMatMulDataset.h" -#include "tests/datasets/SmallMatMulDataset.h" +#include "tests/framework/DatasetModes.h" #include "tests/framework/Macros.h" +#include "tests/framework/TestCase.h" #include "tests/framework/datasets/Datasets.h" #include "tests/validation/Validation.h" + +#include "tests/datasets/LargeMatMulDataset.h" +#include "tests/datasets/SmallMatMulDataset.h" #include "tests/validation/fixtures/MatMulFixture.h" namespace arm_compute @@ -39,55 +44,143 @@ namespace validation { namespace { -RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */ -constexpr float abs_tolerance_f32( - 0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */ -constexpr float abs_tolerance_f16( - 0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type in case using relative tolerance fails because of small values */ -RelativeTolerance tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */ +RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for fp32 data type */ +constexpr float abs_tolerance_f32(0.0001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp32 data type in case using relative tolerance fails because of small values */ +constexpr float abs_tolerance_f16(0.001f); /**< Absolute tolerance value for comparing reference's output against implementation's output for fp16 data type 
in case using relative tolerance fails because of small values */ +RelativeTolerance tolerance_f16(half(0.01)); /**< Tolerance value for comparing reference's output against implementation's output for fp16 data type */ +constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ } // namespace template -using MatMulFixture = MatMulValidationFixture; +using CLMatMulFixture = MatMulValidationFixture; + +template +using CLQuantizedMatMulFixture = QuantizedMatMulValidationFixture; TEST_SUITE(CL) TEST_SUITE(MatMul) + +TEST_SUITE(Float) TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, MatMulFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("pretransose_A", { false, true })), - framework::dataset::make("pretransose_B", { false, true })), - framework::dataset::make("DataType", DataType::F32))) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F32))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); } -FIXTURE_DATA_TEST_CASE(RunLarge, MatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("pretransose_A", { false, true })), - framework::dataset::make("pretransose_B", { false, true })), - framework::dataset::make("DataType", DataType::F32))) + +FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + 
framework::dataset::make("DataType", DataType::F32))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32, 0.f, abs_tolerance_f32); } + TEST_SUITE_END() // FP32 + TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, MatMulFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallMatMulDataset(), - framework::dataset::make("pretransose_A", { false, true })), - framework::dataset::make("pretransose_B", { false, true })), - framework::dataset::make("DataType", DataType::F16))) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F16))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); } -FIXTURE_DATA_TEST_CASE(RunLarge, MatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), - framework::dataset::make("pretransose_A", { false, true })), - framework::dataset::make("pretransose_B", { false, true })), - framework::dataset::make("DataType", DataType::F16))) + +FIXTURE_DATA_TEST_CASE(RunLarge, CLMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::F16))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f16, 0.f, abs_tolerance_f16); } + TEST_SUITE_END() // FP16 +TEST_SUITE_END() // Float + +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine( + datasets::SmallMatMulDataset(), + 
framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), + framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) +) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( + datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), + framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) +) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); +} + +TEST_SUITE_END() // QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLQuantizedMatMulFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine( + datasets::SmallMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), + 
framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 50, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 30, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) +) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLQuantizedMatMulFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine( + datasets::LargeMatMulDataset(), + framework::dataset::make("TransposeA", { false, true })), + framework::dataset::make("TransposeB", { false, true })), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), + framework::dataset::make("NumberOfExtraRuns", { 0, 1 })), + framework::dataset::make("LhsQInfo", { QuantizationInfo(1.f / 100, 1) })), + framework::dataset::make("RhsQInfo", { QuantizationInfo(1.f / 200, -1) })), + framework::dataset::make("DstQInfo", { QuantizationInfo(1.f, 2) })) +) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_quant); +} + +TEST_SUITE_END() // QASYMM8_SIGNED + +TEST_SUITE_END() // Quantized + TEST_SUITE_END() // MatMul TEST_SUITE_END() // CL } // namespace validation diff --git a/tests/validation/fixtures/MatMulFixture.h b/tests/validation/fixtures/MatMulFixture.h index 15719024b1..2f94c1f9d2 100644 --- a/tests/validation/fixtures/MatMulFixture.h +++ b/tests/validation/fixtures/MatMulFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef TESTS_VALIDATION_FIXTURES_MATMULFIXTURE -#define TESTS_VALIDATION_FIXTURES_MATMULFIXTURE +#ifndef ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE +#define ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" @@ -188,8 +188,9 @@ protected: std::vector output_multipliers{ output_multiplier }; std::vector output_shifts{ output_shift }; + //The lhs and rhs offsets are negated here to keep the reference aligned with the function implementation where the lhs and rhs offsets are also negated. const auto tmp = reference::gemmlowp_matrix_multiply_core( - a, b, c.shape(), aq.offset, bq.offset); + a, b, c.shape(), -aq.offset, -bq.offset); auto output = reference::gemmlowp_quantize_down_scale_by_fixedpoint( tmp, output_multipliers, output_shifts, oq.offset, @@ -314,4 +315,4 @@ public: } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_MATMUL_FIXTURE */ +#endif /* ACL_TESTS_VALIDATION_FIXTURES_MATMULFIXTURE */ -- cgit v1.2.1