aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-12-07 16:47:52 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:42:33 +0000
commit358ca205c9e41f523517ffa55a9057308b736040 (patch)
tree5d86460e73e9ad99837ae0c9c903448592d8c848 /src
parenteb8f71eecbc44e64cd7814f53b27b42c43dd660b (diff)
downloadComputeLibrary-358ca205c9e41f523517ffa55a9057308b736040.tar.gz
COMPMID-617: Adds CLFullyConnectionLayer validation support
Change-Id: I4d2eb9872a3165fdcaa7784596e441cbe563dbc2 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112577 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Ioan-Cristian Szabo <ioan-cristian.szabo@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp89
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp135
-rw-r--r--src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp134
-rw-r--r--src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp104
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp65
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp166
-rw-r--r--src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp97
-rw-r--r--src/core/CL/kernels/CLIm2ColKernel.cpp34
-rw-r--r--src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp8
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp129
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp104
11 files changed, 773 insertions, 292 deletions
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 7741f12900..6886f54602 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -33,8 +33,55 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+ bool window_changed = false;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+
+ // Configure window in case of configured output
+ if(output->total_size() != 0)
+ {
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
: _input(nullptr), _output(nullptr)
@@ -43,22 +90,13 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
- DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape = input->info()->tensor_shape();
- output_shape.set(0, input->info()->dimension(0) * 4);
- output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
@@ -68,20 +106,9 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure(win);
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
// Set config_id for enabling LWS tuning
_config_id = "interleave4x4_";
@@ -92,6 +119,14 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
+Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 1d9fe4bc01..423592b79c 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -46,79 +46,114 @@ namespace arm_compute
class Coordinates;
} // namespace arm_compute
-CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr)
+namespace
{
-}
+using ElementsProcessed = Steps;
-void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
if(!is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
}
- _input0 = input0;
- _input1 = input1;
- _output = output;
+ return Status{};
+}
- CLBuildOptions build_opts;
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed,
+ ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+ Window win{};
+ bool window_changed = false;
+ // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
if(is_interleaved_transposed)
{
- // Create kernel and set static arguments
- build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options()));
-
// Configure window
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ num_elems_processed_per_iteration_x = 16;
+ num_elems_processed_per_iteration_y = 4;
constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
- AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- update_window_and_padding(win, input0_access, input1_access, output_access);
+ AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration_input0, 1);
+ AccessWindowRectangle input1_access(input1, 0, 0, num_elems_read_per_iteration_input1, 1);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
- ICLKernel::configure(win);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
}
else
{
// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
-
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options()));
+ num_elems_processed_per_iteration_x = 16;
+ num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
// Configure window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- update_window_and_padding(win, input0_access, input1_access, output_access);
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+ coord.set_num_dimensions(output->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
- ICLKernel::configure(win);
+ // Create build options
+ CLBuildOptions build_opts;
+ std::string kernel_name(" ");
+ if(is_interleaved_transposed)
+ {
+ build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ kernel_name = "gemmlowp_mm_interleaved_transposed";
}
+ else
+ {
+ build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
+ kernel_name = "gemmlowp_mm";
+ }
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Set config_id for enabling LWS tuning
_config_id = "gemmlowp_";
@@ -132,6 +167,20 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
+Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ is_interleaved_transposed,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index 2877a74be8..d05939fcf5 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -44,89 +44,135 @@ namespace arm_compute
class Coordinates;
} // namespace arm_compute
-CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
- : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
+namespace
{
-}
-
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-
- // Set the arguments to pass at compile time
- CLBuildOptions build_opts;
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
// If a_offset == 0, vector_sum_col can be a nullptr
if(a_offset != 0)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0));
-
- build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
- build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
}
// If b_offset == 0, vector_sum_row can be a nullptr
if(b_offset != 0)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
- // Validate batches
- TensorShape output_shape = mm_result->info()->tensor_shape();
+ TensorShape output_shape = mm_result->tensor_shape();
if(output_shape.num_dimensions() > 1)
{
- TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape();
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
vector_sum_row_shape.collapse_from(1);
output_shape.collapse_from(2);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+ "mm_result tensor must have the same number of batches of output tensor");
if(a_offset != 0)
{
- TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
- && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
}
}
-
- build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset));
}
- build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
-
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
+ return Status{};
+}
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
+{
constexpr unsigned int num_elems_processed_per_iteration = 16;
+ bool window_changed = false;
// Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, mm_result_access);
+ AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win,
+ mm_result_access);
if(a_offset != 0)
{
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, vector_sum_col_access);
+ AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win,
+ vector_sum_col_access);
}
-
if(b_offset != 0)
{
- AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
- update_window_and_padding(win, vector_sum_row_access);
+ AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+ window_changed = window_changed || update_window_and_padding(win,
+ vector_sum_row_access);
}
- ICLKernel::configure(win);
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
+ : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
+{
+}
+
+void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ a_offset, b_offset)); // NOLINT
+
+ _vector_sum_col = vector_sum_col;
+ _vector_sum_row = vector_sum_row;
+ _mm_result = mm_result;
+
+ // Set the arguments to pass at compile time
+ CLBuildOptions build_opts;
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
+ {
+ build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+ build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+ }
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+ build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ a_offset, b_offset); // NOLINT
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
+
+Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ a_offset, b_offset)
+ .first); // NOLINT
+
+ return Status{};
}
void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index bcf04b0982..6951512167 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -44,6 +44,59 @@ namespace arm_compute
class Coordinates;
} // namespace arm_compute
+namespace
+{
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ return Status{};
+}
+std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
: _input(), _output()
{
@@ -51,8 +104,9 @@ ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
_input = mtx_a;
_output = vector_sum_row;
@@ -64,21 +118,18 @@ void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTens
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options()));
- const unsigned int num_elems_processed_per_iteration = 1;
-
// Configure kernel window
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- input_access,
- output_access);
+ auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get()).first);
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -107,8 +158,8 @@ void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueu
void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
_input = mtx_b;
_output = vector_sum_col;
@@ -121,21 +172,18 @@ void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTens
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options()));
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), num_elems_processed_per_iteration), _input->info()->dimension(1));
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- input_access,
- output_access);
+ auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 015b4f70a4..d5c93dd24a 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -36,6 +36,37 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ // Select the vector size to use (8 for Bifrost; 16 for Midgard).
+ num_elems_processed_per_iteration = (gpu_target == GPUTarget::BIFROST) ? 8 : 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->dimension(1));
+ AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, biases_access, accum_access);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
: _accum(nullptr), _biases(nullptr)
{
@@ -43,18 +74,21 @@ CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
_biases = biases;
_accum = accum;
// Get the target architecture
- GPUTarget arch_target = get_arch_from_target(get_target());
- // Select the vector size to use (8 for Bifrost; 16 for Midgard).
- const unsigned int vector_size = (arch_target == GPUTarget::BIFROST) ? 8 : 16;
+ GPUTarget arch_target = get_arch_from_target(get_target());
+ unsigned int vector_size = 0;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(accum->info(), biases->info(), arch_target, vector_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
// Add build options
CLBuildOptions build_opts;
@@ -65,18 +99,15 @@ void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTe
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts.options()));
+}
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration = vector_size;
-
- Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
- AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, biases_access, accum_access);
+Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target)
+{
+ unsigned int num_elems_processed_per_iteration = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get(), gpu_target, num_elems_processed_per_iteration).first);
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 16706dd748..f51d0f92d4 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -42,6 +42,81 @@
using namespace arm_compute;
+namespace
+{
+using ElementsProcessed = Steps;
+
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ if(!is_interleaved_transposed)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+ }
+
+ return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
+ bool is_interleaved_transposed, GPUTarget gpu_target,
+ ElementsProcessed &num_elements_processed)
+{
+ bool window_changed = false;
+ Window win{};
+
+ const DataType data_type = input0->data_type();
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+ if(is_interleaved_transposed)
+ {
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+ num_elems_processed_per_iteration_y = 4;
+
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+ AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ }
+ else // The input tensors have not been reshaped
+ {
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
+ num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+ num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+
+ // Create kernels according to the architecture, data type and input size.
+ if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
+ {
+ num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000) ? 2 : 4;
+ }
+
+ // Configure window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
: _input0(nullptr), _input1(nullptr), _output(nullptr)
{
@@ -49,13 +124,10 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
- }
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
_input0 = input0;
_input1 = input1;
@@ -82,14 +154,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
_lws_hint = cl::NDRange(8, 8);
}
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
// Create build options
CLBuildOptions build_opts;
build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));
- const bool multiply_alpha = std::abs(1.0f - alpha) > 0.00001f;
-
// Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
- if(multiply_alpha)
+ if(std::abs(1.0f - alpha) > 0.00001f)
{
build_opts.add_option_if_else(is_data_type_fixed_point(data_type),
"-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),
@@ -108,49 +185,19 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
{
kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
}
-
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
}
else // The input tensors have not been reshaped
{
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
- unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-
// Create kernels according to the architecture, data type and input size.
if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
{
// The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
// FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
// FC6 and FC7 of AlexNet and VGG-16).
- if(input1->info()->dimension(0) <= 1000)
- {
- // Each work-item processes 2 elements in the X dimension.
- num_elems_processed_per_iteration_x = 2;
- kernel_name = "gemm_mm_floating_point_f32_bifrost_1000";
- }
- else
- {
- // Each work-item processes 4 elements in the X dimension (as in the default case).
- num_elems_processed_per_iteration_x = 4;
- kernel_name = "gemm_mm_floating_point_f32_bifrost";
- }
+ kernel_name = (input1->info()->dimension(0) <= 1000) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
+
// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
// via exhaustive autotuning over a range of representative layer configurations.
_lws_hint = cl::NDRange(4);
@@ -164,23 +211,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
kernel_name = "gemm_mm_floating_point";
}
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
-
- // Configure window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
}
// Create kernel
@@ -198,6 +230,22 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ is_interleaved_transposed,
+ gpu_target,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 35074f94cf..69a545b76b 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -33,36 +33,82 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <cmath>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
-void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_transpose1xW_with_element_size_shape(*input));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
- TensorShape output_shape{ input->info()->tensor_shape() };
- const size_t transpose_w = 16 / input->info()->element_size();
- output_shape.set(0, input->info()->dimension(1) * transpose_w);
- output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+ return Status{};
+}
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+{
+ num_elems_processed_per_iteration = 16 / input->element_size();
+
+ const int scale_x = num_elems_processed_per_iteration;
+ bool window_changed = false;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ if((win.x().end() / scale_x) == 0)
+ {
+ return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Transposed shape would be 0 in the second dimension"), win);
+ }
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+
+ // Configure window in case of configured output
+ if(output->total_size() != 0)
+ {
+ AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Output tensor auto inizialitation if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info())));
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const int scale_x = num_elems_processed_per_iteration;
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
+ // Configure kernel window
+ unsigned int num_elems_processed_per_iteration = 1;
+ auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
/*
* Following an example of how the transposition1xW works when the input data type is F32
*
@@ -76,20 +122,15 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp
// Create kernel
std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+}
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
- ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ unsigned int num_elems_processed_per_iteration = 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 6514d6cf91..0e9f2c5344 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -39,6 +39,24 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
CLIm2ColKernel::CLIm2ColKernel()
: _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
{
@@ -46,9 +64,10 @@ CLIm2ColKernel::CLIm2ColKernel()
void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
@@ -184,6 +203,15 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_UNUSED(kernel_dims);
+ ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_UNUSED(has_bias);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ return Status{};
+}
+
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index f6964002dd..3d41548a6a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -71,16 +71,16 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
vector_sum_row_shape.collapse_from(1);
output_shape.collapse_from(2);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+ "mm_result tensor must have the same number of batches of output tensor");
if(a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
- && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 7fd81cdb94..68c6576a79 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
@@ -32,6 +33,34 @@
#include <algorithm>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, bool is_interleaved_transposed)
+{
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ if(is_data_type_quantized_asymmetric(input.data_type()))
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info(input.quantization_info().scale, -input.quantization_info().offset);
+ const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, -weights.quantization_info().offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info),
+ &output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target));
+ }
+
+ return Status{};
+}
+} // namespace
void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
@@ -40,6 +69,11 @@ void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLT
_kernel = std::move(k);
}
+Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLTransposeKernel::validate(input, output);
+}
+
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
_gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
@@ -80,8 +114,7 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.collapse(3);
+ TensorShape shape_im2col = compute_im2col_shape(*input->info());
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
// Configure im2col kernel
@@ -105,9 +138,15 @@ void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTen
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
+ weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(),
+ transpose_weights,
+ are_weights_reshaped));
_are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
_is_fc_after_conv = true;
@@ -192,6 +231,86 @@ void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *w
}
}
+Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = transpose_weights ? are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+ bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(*input)));
+ const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+ // Configure accumulate biases kernel for non quantized asymmetric types
+ if(biases != nullptr && !is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+ if(!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+ if(is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
+ input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1;
+ }
+
+ if(is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+ // Validate im2col kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false));
+ input_to_use = &im2col_input;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output, false));
+
+ // Validate output stage for asymmetric quantized types
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
+ }
+
+ return Status{};
+}
+
void CLFullyConnectedLayer::run()
{
// Reshape of the weights (happens only once)
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 5c6f5b4ed0..ddcab6a256 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -29,9 +29,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
@@ -41,14 +43,9 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_UNUSED(gemm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_a_offset = a->info()->quantization_info().offset;
@@ -65,18 +62,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
+ TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
_memory_group.manage(&_tmp_a);
@@ -95,13 +82,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
if(_a_offset != 0)
{
- TensorShape shape_vector_sum_col = b->info()->tensor_shape();
-
- if(shape_vector_sum_col.num_dimensions() > 1)
- {
- shape_vector_sum_col.remove_dimension(1);
- }
- TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+ TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
_memory_group.manage(&_vector_sum_col);
@@ -112,13 +93,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- TensorShape shape_vector_sum_row = a->info()->tensor_shape();
- shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
- if(a->info()->num_dimensions() > 1)
- {
- shape_vector_sum_row.remove_dimension(1);
- }
- TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
@@ -147,6 +122,67 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
}
}
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+ "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+ "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ int32_t a_offset = a->quantization_info().offset;
+ int32_t b_offset = b->quantization_info().offset;
+ bool is_interleaved_transposed = a->dimension(1) > 16;
+
+ if(is_interleaved_transposed)
+ {
+ TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
+ TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ }
+
+ TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+ // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+ if(a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
+ }
+
+ // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
+
+ return Status{};
+}
+
void CLGEMMLowpMatrixMultiplyCore::run()
{
_memory_group.acquire();