aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGunes Bayir <gunes.bayir@arm.com>2023-12-22 15:43:29 +0000
committerGunes Bayir <gunes.bayir@arm.com>2023-12-22 17:35:16 +0000
commitc310c11a4baa1eca4e4007b3250f4a771989f36f (patch)
tree4992c6b0978fce5aeabeea23678be6e1a13c6c6b
parent85cafff0dd99b6f94a77a7d7933682fa7c6a4a70 (diff)
downloadComputeLibrary-c310c11a4baa1eca4e4007b3250f4a771989f36f.tar.gz
Fix nightly issue caused by gemm_reshaped_only_rhs_mmul kernel
The issue appears when this kernel is used by convolution operators because the stride calculations consider only simple matrix multiplication. In conv2d triggered runs, Rhs does not have the same dimension as Lhs and Dst. Also, cases where Lhs and Dst are interpreted as 3d, where their X and Y dimensions (in convolution sense) are collapsed into one. Resolves: COMPMID-6764 Change-Id: If443e6eb8f7a5cca1acc58b37c598122a013e69b Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10913 Benchmark: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl32
-rw-r--r--src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp9
2 files changed, 36 insertions, 5 deletions
diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
index 8919023d4c..09b8956b68 100644
--- a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -117,9 +117,23 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
uint rhs_y = block_id;
// Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef BATCHED_RHS
rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+#else // BATCHED_RHS
+ rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y;
+#endif // BATCHED_RHS
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
// Note: If RHS derives from the weights of convolution 2d layer, RHS will always be 2D and rhs_stride_z will always be equal to 0 for
// not sliding the tensor
@@ -367,11 +381,25 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
// Starting RHS coordinates
uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+
+#ifdef BATCHED_RHS
uint rhs_y = block_id + z * rhs_h;
+#else // BATCHED_RHS
+ uint rhs_y = block_id;
+#endif // BATCHED_RHS
// Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+ lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+ dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
// Initialize the accumulators
// MMUL extension accumulate the result in F32 for both F32 and F16
@@ -525,4 +553,4 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE) \ No newline at end of file
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE)
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index 4ca5580443..4ca4b83f9c 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -41,7 +41,6 @@
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"
-
namespace arm_compute
{
namespace opencl
@@ -101,7 +100,7 @@ Status validate_arguments(const ITensorInfo *src0,
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
// Validate the reinterpreted-as-3D-case
- if (gemm_info.reinterpret_input_as_3d != 0)
+ if (gemm_info.reinterpret_input_as_3d)
{
ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
}
@@ -284,8 +283,12 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon
build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+ build_opts.add_option_if(gemm_info.reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(gemm_info.depth_output_gemm3d != 0, "-DREINTERPRET_OUTPUT_AS_3D");
+ build_opts.add_option_if(src1->num_dimensions() > 2, "-DBATCHED_RHS");
+
std::string kernel_name("gemm_mm_reshaped_only_rhs_nt_mmul");
- kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
+ kernel_name += _export_to_cl_image ? "_texture" : "";
// A macro guard to compile ONLY the kernel of interest
build_opts.add_option("-D" + upper_string(kernel_name));