aboutsummaryrefslogtreecommitdiff
path: root/src/core
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2019-03-15 10:13:05 +0000
committerGian Marco Iodice <gianmarco.iodice@arm.com>2019-03-20 11:21:46 +0000
commitb0c5037d94ba7073ccabb0ebaff54db320f184c4 (patch)
tree126f2332df60b6eff1e630b2585b2bd407501a20 /src/core
parent5ed7b5bc98feb848874730c9bb9c30759e58d453 (diff)
downloadComputeLibrary-b0c5037d94ba7073ccabb0ebaff54db320f184c4.tar.gz
COMPMID-2043: Add support for "dummy threads" in CLGEMMReshaped
Change-Id: I89403b97503fbb99f6a32f5d62b8c535ab26a7be Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/877 Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/CL/CLHelpers.cpp9
-rw-r--r--src/core/CL/ICLKernel.cpp11
-rw-r--r--src/core/CL/cl_kernels/gemm.cl44
-rw-r--r--src/core/CL/cl_kernels/gemmlowp.cl11
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp9
7 files changed, 76 insertions, 24 deletions
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 18ef185ac0..801347e200 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -255,4 +255,11 @@ size_t preferred_vector_width(const cl::Device &device, const DataType dt)
return 1;
}
}
+
+bool preferred_dummy_work_items_support(const cl::Device &device)
+{
+ ARM_COMPUTE_UNUSED(device);
+ // TODO (COMPMID-2044)
+ return true;
+}
} // namespace arm_compute
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 995fcb481b..2d28a496c9 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@
using namespace arm_compute;
-void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint)
+void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
{
if(kernel.kernel()() == nullptr)
{
@@ -58,6 +58,13 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
return;
}
+ // Use dummy work-items
+ if(use_dummy_work_items)
+ {
+ gws.get()[0] = get_next_power_two(gws[0]);
+ gws.get()[1] = get_next_power_two(gws[1]);
+ }
+
cl::NDRange valid_lws;
if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
{
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 211484440b..45c600cd37 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1128,7 +1128,7 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
#endif // defined(TRANSPOSE)
#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
-#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
#define CONCAT(a, b) a##b
@@ -1256,6 +1256,8 @@ __kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
*
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (i.e. -DM=52, -DN=30 and -DK=90)
* @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
* @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
@@ -1332,6 +1334,13 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
uint y = get_global_id(1);
uint z = get_global_id(2);
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
// Compute LHS matrix address
uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
@@ -1534,15 +1543,6 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
// Left-over accumulations
for(; i < K; ++i)
{
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
// Load values from LHS matrix
DATA_TYPE a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
@@ -1852,6 +1852,8 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
* The LHS matrix is NOT reshaped
* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
*
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and and -DK (i.e. -DM=52, -DN=30 and -DK=90).
* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
* @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
@@ -1860,7 +1862,7 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
* - M0 = 1, 2, 3, 4, 5, 6, 7, 8
* - N0 = 2, 3, 4, 8, 16
* - K0 = 2, 3, 4, 8, 16
- * - H0 > 1
+ * - H0 >= 1
*
* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
@@ -1927,6 +1929,13 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
uint y = get_global_id(1);
uint z = get_global_id(2);
+#if defined(DUMMY_WORK_ITEMS)
+ if((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
// Compute LHS matrix address
uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
@@ -2259,9 +2268,9 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N)
#if K0 == 2
#define ARM_DOT_K0(a, b, c) \
@@ -2381,6 +2390,8 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
*
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
@@ -2461,6 +2472,13 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
+#if defined(DUMMY_WORK_ITEMS)
+ if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
// Compute LHS matrix address
__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
(get_global_id(2) * lhs_stride_z);
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 277338bf08..8dc22d7d56 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1944,7 +1944,7 @@ __kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
-#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0)
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
@@ -2103,6 +2103,8 @@ __kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
*
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
@@ -2183,6 +2185,13 @@ __kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)
+#if defined(DUMMY_WORK_ITEMS)
+ if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
// Compute LHS matrix address
__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X + (get_global_id(1) / V0) * (uint)lhs_stride_y + (get_global_id(
2)
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
index e9be1a6dfc..a8c1704d91 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -165,7 +165,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
} // namespace
CLGEMMLowpMatrixMultiplyReshapedKernel::CLGEMMLowpMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
{
}
@@ -181,6 +181,7 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0,
_output = output;
_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
_k = gemm_info.k();
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
// Check if we need to slide the matrix B
const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -201,6 +202,9 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0,
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -302,7 +306,7 @@ void CLGEMMLowpMatrixMultiplyReshapedKernel::run(const Window &window, cl::Comma
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
} \ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index b6816ac5c4..8969124d71 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -168,7 +168,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
} // namespace
CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false)
{
}
@@ -184,6 +184,7 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, cons
_output = output;
_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
_k = gemm_info.k();
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
// Check if we need to slide the matrix B
const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -206,6 +207,9 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const ICLTensor *input0, cons
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -308,7 +312,7 @@ void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQu
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
} \ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 6dc2a9e610..af06fecd00 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -161,7 +161,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
} // namespace
CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false)
{
}
@@ -177,6 +177,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input
_output = output;
_reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
_reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+ _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
// In case both input and output have to be reinterpreted as 3D tensors,
// force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -202,13 +203,15 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input
build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
-
std::string kernel_name("gemm_mm_reshaped_only_rhs_");
kernel_name += rhs_info.transpose ? "t" : "nt";
@@ -308,7 +311,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
}
while(window.slide_window_slice_3D(slice));
}