commit 7c850d5ca1aa0dd255eed794134535ece7849f96 (patch)
tree 40106749c1d52a3fc7f17e800c81bf15e5891b41 /src
parent 8a14b2ca62c43a2691066ce374949c2501ae8315 (diff)
Author:    Michele Di Giorgio <michele.digiorgio@arm.com>  2020-09-04 15:01:15 +0100
Committer: Michele Di Giorgio <michele.digiorgio@arm.com>  2020-09-07 13:04:49 +0000
COMPMID-3155: Remove padding from NEGEMMLowpOffsetContributionKernel
Change-Id: I93c3b795cf6fe0b27008543b6671a3be0a965603
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3916
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp | 342
1 file changed, 178 insertions(+), 164 deletions(-)
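The patch replaces the kernel's implicit reliance on tensor padding (full 16-element vector accesses everywhere) with an explicit main loop over 16-element chunks plus a scalar leftover loop. A minimal standalone sketch of that pattern, with illustrative names rather than the library's kernel code:

    // Process full 16-lane chunks with NEON intrinsics, then finish the
    // tail in scalar code so no padding is needed past the last element.
    #include <arm_neon.h>
    #include <cstdint>

    void add_a_offset_contribution(int32_t *mm_result, const int32_t *vector_sum_col,
                                   int32_t a_offset, int len)
    {
        int x = 0;
        for(; x <= len - 16; x += 16) // main loop: 4 x int32x4_t per step
        {
            for(int i = 0; i < 16; i += 4)
            {
                const int32x4_t sum_col = vld1q_s32(vector_sum_col + x + i);
                int32x4_t       acc     = vld1q_s32(mm_result + x + i);
                acc = vaddq_s32(acc, vmulq_n_s32(sum_col, a_offset));
                vst1q_s32(mm_result + x + i, acc);
            }
        }
        for(; x < len; ++x) // leftover loop: 0..15 scalar elements
        {
            mm_result[x] += vector_sum_col[x] * a_offset;
        }
    }

The same main-loop/leftover-loop shape appears in all three branches of run_offset_contribution in the diff below.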
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index db6cb10995..5710897329 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -34,16 +34,9 @@
#include "arm_compute/core/Window.h"
#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
namespace arm_compute
{
-class Coordinates;
-} // namespace arm_compute
-
namespace
{
Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
@@ -96,42 +89,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
- int32_t a_offset, int32_t b_offset)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- bool window_changed = false;
-
- // Configure kernel window
- Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, mm_result_access);
-
- if(a_offset != 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
- }
- if(b_offset != 0)
- {
- AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
- window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
void run_offset_contribution(const Window &window,
ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d)
{
Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+ collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16;
+
+ Iterator mm_result_it(mm_result, collapsed_window);
+
if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
{
// Set window for vector_sum_col
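The validate_and_configure_window function removed above existed only to grow the window to whole 16-element steps and to fail with "Insufficient Padding!" when the tensors could not absorb them. In the new code, DimX of the collapsed window is forced to a single step (0, 1, 1), so each execute_window_loop invocation covers one full row and the kernel strides X itself from window_start_x to window_end_x. A tiny self-checking sketch (plain C++, assumed sizes) of why the main and leftover loops together visit every element exactly once without overrunning:

    #include <cassert>

    int main()
    {
        const int window_step_x = 16;
        for(int window_end_x = 0; window_end_x <= 64; ++window_end_x)
        {
            int covered = 0;
            int x = 0; // window_start_x
            for(; x < (window_end_x - window_step_x); x += window_step_x)
            {
                covered += window_step_x; // full in-bounds vector chunk
            }
            for(; x < window_end_x; ++x)
            {
                ++covered; // scalar leftover, also in bounds
            }
            assert(covered == window_end_x); // every element visited once
        }
        return 0;
    }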
@@ -147,7 +120,6 @@ void run_offset_contribution(const Window &window,
Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
- Iterator mm_result_it(mm_result, window);
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
@@ -156,67 +128,86 @@ void run_offset_contribution(const Window &window,
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
- {
- {
- vld1q_s32(vector_sum_col_ptr + 0),
- vld1q_s32(vector_sum_col_ptr + 4),
- vld1q_s32(vector_sum_col_ptr + 8),
- vld1q_s32(vector_sum_col_ptr + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+ const int batch_id = id.z() / depth_input;
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
// Compute the leftover term due to b_offset.
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
- + (id.z() % depth_input) * height_input);
- b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+ int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
+
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
- // Add a_offset_term_s32 and b_offset_term_s32
- int32x4x4_t offset_term_s32 =
+ int x = window_start_x;
+ for(; x < (window_end_x - window_step_x); x += window_step_x)
{
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 =
{
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset),
- vdupq_n_s32(k_offset)
- }
- };
-
- offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32));
- offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32));
- offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32));
- offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32));
-
- int32x4x4_t in_s32 =
- {
+ {
+ vld1q_s32(vector_sum_col_ptr + x + 0),
+ vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8),
+ vld1q_s32(vector_sum_col_ptr + x + 12)
+ }
+ };
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ // Add a_offset_term_s32 and b_offset_term_s32
+ int32x4x4_t offset_term_s32 =
+ {
+ {
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset),
+ vdupq_n_s32(k_offset)
+ }
+ };
+
+ offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
+ offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
+ offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
+ offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+
+ int32x4x4_t in_s32 =
{
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+ {
+ vld1q_s32(mm_result_ptr + x + 0),
+ vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8),
+ vld1q_s32(mm_result_ptr + x + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for(; x < window_end_x; ++x)
+ {
+ // Compute the leftover term due to a_offset.
+ int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ a_offset_term_s32 *= a_offset;
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
+ }
},
vector_sum_col_it, vector_sum_row_it, mm_result_it);
}
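This first branch (a_offset != 0 and b_offset != 0) applies the complete offset contribution: k_offset (precomputed by the kernel as a_offset * b_offset * k), the column sums scaled by a_offset, and the per-row sum scaled by b_offset. A scalar reference of the same arithmetic, as a sketch with illustrative names:

    #include <cstdint>
    #include <vector>

    void offset_contribution_ref(std::vector<int32_t> &mm_result, int cols,
                                 const std::vector<int32_t> &vector_sum_col,
                                 const std::vector<int32_t> &vector_sum_row,
                                 int32_t a_offset, int32_t b_offset, int32_t k_offset)
    {
        const int rows = static_cast<int>(mm_result.size()) / cols;
        for(int y = 0; y < rows; ++y)
        {
            // Row-constant term from the row sums (b_offset part).
            const int32_t b_term = vector_sum_row[y] * b_offset;
            for(int x = 0; x < cols; ++x)
            {
                // k_offset + a_offset * column sum + b_offset * row sum.
                mm_result[y * cols + x] += k_offset + vector_sum_col[x] * a_offset + b_term;
            }
        }
    }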
@@ -231,40 +222,53 @@ void run_offset_contribution(const Window &window,
win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
- Iterator mm_result_it(mm_result, window);
const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
- const int batch_id = id.z() / depth_input;
+ const int batch_id = id.z() / depth_input;
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
// Compute the leftover term due to b_offset.
- int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
- + (id.z() % depth_input) * height_input);
- b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+ int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
+ b_offset_term_s32 *= b_offset;
- int32x4x4_t in_s32 =
+ const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+ int x = window_start_x;
+ for(; x < (window_end_x - window_step_x); x += window_step_x)
{
+ int32x4x4_t in_s32 =
{
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32);
-
- // Store the result with the offset contribution
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+ {
+ vld1q_s32(mm_result_ptr + x + 0),
+ vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8),
+ vld1q_s32(mm_result_ptr + x + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for(; x < window_end_x; ++x)
+ {
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += b_offset_term_s32;
+ }
},
vector_sum_row_it, mm_result_it);
}
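In this b_offset-only branch the addend is constant across a row, so it is broadcast once into all four lanes with vdupq_n_s32 and added lane-wise, and the plain int32_t value is reused in the scalar tail. A one-row sketch of that idea (illustrative function, not the kernel):

    #include <arm_neon.h>
    #include <cstdint>

    void add_row_constant(int32_t *row, int len, int32_t b_offset_term)
    {
        const int32x4_t b_vec = vdupq_n_s32(b_offset_term); // broadcast once
        int x = 0;
        for(; x <= len - 4; x += 4)
        {
            vst1q_s32(row + x, vaddq_s32(vld1q_s32(row + x), b_vec));
        }
        for(; x < len; ++x)
        {
            row[x] += b_offset_term; // scalar tail
        }
    }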
@@ -276,53 +280,68 @@ void run_offset_contribution(const Window &window,
win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
- Iterator mm_result_it(mm_result, window);
// Offset in case vector_sum_col is batched
const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
- execute_window_loop(window, [&](const Coordinates & id)
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
- const int batch_id = id.z() / depth_input;
- const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ const int batch_id = id.z() / depth_input;
+ auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
- // Compute the leftover term due to a_offset.
- int32x4x4_t a_offset_term_s32 =
+ int x = window_start_x;
+ for(; x < (window_end_x - window_step_x); x += window_step_x)
{
+ // Compute the leftover term due to a_offset.
+ int32x4x4_t a_offset_term_s32 =
{
- vld1q_s32(vector_sum_col_ptr + 0),
- vld1q_s32(vector_sum_col_ptr + 4),
- vld1q_s32(vector_sum_col_ptr + 8),
- vld1q_s32(vector_sum_col_ptr + 12)
- }
- };
-
- a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
- a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
- a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
- a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
-
- int32x4x4_t in_s32 =
- {
+ {
+ vld1q_s32(vector_sum_col_ptr + x + 0),
+ vld1q_s32(vector_sum_col_ptr + x + 4),
+ vld1q_s32(vector_sum_col_ptr + x + 8),
+ vld1q_s32(vector_sum_col_ptr + x + 12)
+ }
+ };
+
+ a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+ a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+ a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+ a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+ int32x4x4_t in_s32 =
{
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
- vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
- }
- };
-
- // Add the offset terms to GEMM's result
- in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
- // Store the result with the offset contribution
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
- vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+ {
+ vld1q_s32(mm_result_ptr + x + 0),
+ vld1q_s32(mm_result_ptr + x + 4),
+ vld1q_s32(mm_result_ptr + x + 8),
+ vld1q_s32(mm_result_ptr + x + 12)
+ }
+ };
+
+ // Add the offset terms to GEMM's result
+ in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+ in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+ in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+ // Store the result with the offset contribution
+ vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+ vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+ vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+ vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+ }
+
+ // Left-overs loop
+ for(; x < window_end_x; ++x)
+ {
+ // Compute the leftover term due to a_offset.
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
+
+ // Add the offset terms to GEMM's result
+ // Store the result with the offset contribution
+ mm_result_ptr[x] += a_offset_term_s32 * a_offset;
+ }
},
vector_sum_col_it, mm_result_it);
}
@@ -365,23 +384,17 @@ void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITe
}
// Configure kernel window
- auto win_config = validate_and_configure_window(mm_result->info(),
- vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
- vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
- a_offset, b_offset);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ Window win = calculate_max_window(*mm_result->info(), Steps());
+ Coordinates coord;
+ coord.set_num_dimensions(mm_result->info()->num_dimensions());
+ mm_result->info()->set_valid_region(ValidRegion(coord, mm_result->info()->tensor_shape()));
+ INEKernel::configure(win);
}
Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
int32_t a_offset, int32_t b_offset)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
- vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
- vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
- a_offset, b_offset)
- .first); // NOLINT
return Status{};
}
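With unit steps the window is never rounded up to a vector width, so configure() needs only the maximum window plus a full valid region, and validate() can drop the window check entirely. A sketch of that idiom built from the calls visible in this patch (the exact include paths are an assumption):

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/ITensorInfo.h"
    #include "arm_compute/core/Window.h"

    arm_compute::Window configure_padding_free(arm_compute::ITensorInfo &out)
    {
        using namespace arm_compute;
        // Steps() defaults to a step of 1, so the window needs no
        // rounding to a vector width and hence no tensor padding.
        Window win = calculate_max_window(out, Steps());
        // Mark the whole output as valid: no border region is undefined.
        Coordinates coord;
        coord.set_num_dimensions(out.num_dimensions());
        out.set_valid_region(ValidRegion(coord, out.tensor_shape()));
        return win;
    }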
@@ -399,3 +412,4 @@ void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadI
run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
}
+} // namespace arm_compute