aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2020-10-27 12:44:17 +0000
committerMichele Di Giorgio <michele.digiorgio@arm.com>2020-10-29 17:05:56 +0000
commit27d92fd5da6ad16c9e3b38d82402a86cf7b208aa (patch)
tree61438af5d104a55bbf4a90735ad430f99c73e45c /src
parent3673839cde43cc82c186a196c7e1ce3155457b0e (diff)
downloadComputeLibrary-27d92fd5da6ad16c9e3b38d82402a86cf7b208aa.tar.gz
COMPMID-3928: Fix output conversion in gemmlowp_mm_native
This patch solves the following issues that arose from nightly tests: - The accumulated result of gemmlowp_mm_native can be either uint or int and in order to be stored in memory we need to convert it to int. - The RHS matrix still needs padding on the X dimension. Hence, revert few changes to add the necessary padding elements. - Replace zero padding validation tests with assertion in the configure method of the kernel. Change-Id: Ib6614a91bd0e98f2b850f52eef14d4fbf55517c8 Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4259 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/cl_kernels/gemmlowp.cl5
-rw-r--r--src/core/CL/cl_kernels/repeat.h4
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp29
3 files changed, 28 insertions, 10 deletions
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 059c2e14df..bde7dd016f 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1100,8 +1100,9 @@ __kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
const bool cond_y = y == 0;
const bool cond_x = ((x + 1) * N0 >= N);
- // Store output block
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+ // Convert and store output block
+ REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0));
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index 59bf5b9d8e..bed94a7b3b 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -134,6 +134,10 @@
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
// Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
+
+// Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
index cc98845e0f..af7755b4e4 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
@@ -37,10 +37,6 @@
#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
namespace arm_compute
{
using namespace misc::shape_calculator;
@@ -110,6 +106,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
Window win{};
+ bool window_changed = false;
// In case both input and output have to be reinterpreted as 3D tensors,
// force reinterpret_output_as_3d to be false.
@@ -137,7 +134,13 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
num_elems_processed_per_iteration_y = lhs_info.m0;
win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ // RHS matrix still needs padding on the X
+ AccessWindowStatic input1_access(input1, 0, 0,
+ ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+ input1->dimension(1));
+
+ window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop
// Collapse along the Z direction
// This collapse needs to be here in order to tune the Z dimension of LWS
@@ -145,7 +148,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
collapsed = win.collapse(win, dimension_to_collapse);
- return std::make_pair(Status(), collapsed);
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
}
} // namespace
@@ -175,6 +179,9 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
_use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+ // We still need padding on the X dimension for the RHS matrix
+ auto padding_info = get_padding_info({ input0, output });
+
// In case both input and output have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
@@ -197,11 +204,15 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
// If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
// we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
// This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
- const unsigned int internal_m = input0->info()->dimension(1);
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
// Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
+ // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+ // NOTE: This might have implications on heuristics and performance
+ const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
// Create build options
CLBuildOptions build_opts;
build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
@@ -213,7 +224,7 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
@@ -245,6 +256,8 @@ void CLGEMMLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com
_config_id += support::cpp11::to_string(rhs_info.n0);
_config_id += "_";
_config_id += support::cpp11::to_string(lhs_info.k0);
+
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}
Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,