From 27d92fd5da6ad16c9e3b38d82402a86cf7b208aa Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Tue, 27 Oct 2020 12:44:17 +0000 Subject: COMPMID-3928: Fix output conversion in gemmlowp_mm_native This patch solves the following issues that arose from nightly tests: - The accumulated result of gemmlowp_mm_native can be either uint or int and in order to be stored in memory we need to convert it to int. - The RHS matrix still needs padding on the X dimension. Hence, revert a few changes to add the necessary padding elements. - Replace zero padding validation tests with an assertion in the configure method of the kernel. Change-Id: Ib6614a91bd0e98f2b850f52eef14d4fbf55517c8 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4259 Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/gemmlowp.cl | 5 +- src/core/CL/cl_kernels/repeat.h | 4 ++ .../CLGEMMLowpMatrixMultiplyNativeKernel.cpp | 29 ++++++--- .../validation/CL/GEMMLowpMatrixMultiplyNative.cpp | 71 ---------------------- 4 files changed, 28 insertions(+), 81 deletions(-) diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index 059c2e14df..bde7dd016f 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -1100,8 +1100,9 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), const bool cond_y = y == 0; const bool cond_x = ((x + 1) * N0 >= N); - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); + // Convert and store output block + REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0)); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); } #endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && 
defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h index 59bf5b9d8e..bed94a7b3b 100644 --- a/src/core/CL/cl_kernels/repeat.h +++ b/src/core/CL/cl_kernels/repeat.h @@ -134,6 +134,10 @@ #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) // Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) +#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) +#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) + +// Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...) #define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) #define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp index cc98845e0f..af7755b4e4 100644 --- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp +++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp @@ -37,10 +37,6 @@ #include "src/core/helpers/WindowHelpers.h" #include "support/StringSupport.h" -#include -#include -#include - namespace arm_compute { using namespace misc::shape_calculator; @@ -110,6 +106,7 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); Window win{}; + bool window_changed = false; // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_output_as_3d to be false. 
@@ -137,7 +134,13 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe num_elems_processed_per_iteration_y = lhs_info.m0; win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + // RHS matrix still needs padding on the X + AccessWindowStatic input1_access(input1, 0, 0, + ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), + input1->dimension(1)); + + window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -145,7 +148,8 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe const unsigned int dimension_to_collapse = std::min(static_cast(output->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - return std::make_pair(Status(), collapsed); + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); } } // namespace @@ -175,6 +179,9 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + // We still need padding on the X dimension for the RHS matrix + auto padding_info = get_padding_info({ input0, output }); + // In case both input and output have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) @@ -197,11 +204,15 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m - const unsigned int internal_m = input0->info()->dimension(1); + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1); // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. const unsigned int partial_store_m0 = internal_m % lhs_info.m0; const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. + // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + // Create build options CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); @@ -213,7 +224,7 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1))); build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())); @@ -245,6 
+256,8 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _config_id += support::cpp11::to_string(rhs_info.n0); _config_id += "_"; _config_id += support::cpp11::to_string(lhs_info.k0); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp index 9e717dfac9..ce000bd8e1 100644 --- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp +++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp @@ -88,81 +88,10 @@ const auto n0_values_nightly = framework::dataset::make("N0", { 1, 2, 3, 4, 8 }) /** K0 values to test - Nightly */ const auto k0_values_nightly = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 }); - -/** Zero padding test */ -bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info) -{ - const unsigned int M = m_value; - const unsigned int N = n_value; - const unsigned int K = k_value; - - GEMMLHSMatrixInfo lhs_info; - lhs_info.m0 = m0_value; - lhs_info.k0 = k0_value; - - GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0_value; - rhs_info.k0 = k0_value; - - GEMMKernelInfo kernel_info; - kernel_info.m = M; - kernel_info.n = N; - kernel_info.k = K; - kernel_info.broadcast_bias = broadcast_bias; - kernel_info.activation_info = act_info; - - const TensorShape lhs_shape(K, M, b_value); - const TensorShape rhs_shape(N, K, b_value); - const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape, 1, data_type), - TensorInfo(rhs_shape, 1, data_type), - kernel_info); - - // Create tensors - CLTensor lhs = create_tensor(lhs_shape, 
data_type); - CLTensor rhs = create_tensor(rhs_shape, data_type); - CLTensor dst = create_tensor(dst_shape, DataType::S32); - - ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); - - // Create and configure function - CLGEMMLowpMatrixMultiplyNative gemm; - gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, GEMMReshapeInfo(m_value, n_value, k_value)); - - // Padding can be added along rhs and bias's X dimension - return dst.info()->padding().empty() && lhs.info()->padding().empty() && rhs.info()->padding().empty(); -} } // namespace TEST_SUITE(CL) TEST_SUITE(GEMMLowpMatrixMultiplyNative) - -/** Validate zero padding tests - * - * A series of validation tests to check that no padding is added as part of configuration for 4 different scenarios. - * - * Checks performed in order: - * - No partial blocks in both x and y dimensions - * - Partial blocks in x dimension - * - Partial blocks in y dimension - * - Partial blocks in both x and y dimensions - * - No blocks in both x and y dimensions, scalar store (N0==1) - * - Special case: partial_n0 == 5 (vstore1 should be invoked instead of vstore_partial_1) - */ -DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip( -framework::dataset::make("M", { 24, 63, 1, 51, 255, }), -framework::dataset::make("N", { 47, 29, 122, 20, 21, })), -framework::dataset::make("M0", { 4, 8, 2, 1, 8, })), -framework::dataset::make("N0", { 4, 4, 3, 1, 8, })), -m_value, n_value, m0_value, n0_value) -{ - bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, false, DataType::QASYMM8, ActivationLayerInfo()); - ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS); -} - - - FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyNativeFixture, framework::DatasetMode::ALL, 
combine(combine(combine(combine(combine(combine(m_values, n_values), -- cgit v1.2.1