author    Gian Marco <gianmarco.iodice@arm.com>    2017-11-30 14:31:13 +0000
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:42:17 +0000
commit    c7f9b893b8edc5660542821e2d0508460bc40225 (patch)
tree      594456a7da9335bebda56498cfbb39be3a9609a2
parent    23ac91b6ba235e67847802d4b49e494fa5bedbb6 (diff)
download  ComputeLibrary-c7f9b893b8edc5660542821e2d0508460bc40225.tar.gz
COMPMID-722 - Support for vector-matrix in GEMMLowp (NEON)
This patch includes COMPMID-716 as well:
- Added vector-matrix case in NEGEMMLowpMatrixMultiplyKernel
- Added benchmarks for NEON and OpenCL

Change-Id: I715cd25e8668a4d6c8127e9a298a865e7713267f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111468
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h |   2
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp          | 701
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp       | 151
-rw-r--r--  tests/benchmark/CL/GEMMLowp.cpp                                   |  52
-rw-r--r--  tests/benchmark/NEON/GEMMLowp.cpp                                 |  28
-rw-r--r--  tests/benchmark/fixtures/GEMMLowpFixture.h                        |  66
-rw-r--r--  tests/datasets/LargeGEMMLowpDataset.h                             |   2
-rw-r--r--  tests/datasets/SmallGEMMLowpDataset.h                             |   2
8 files changed, 780 insertions, 224 deletions
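
For context, a minimal usage sketch of the vector-by-matrix path this patch enables. It mirrors the new benchmark fixture (QASYMM8 inputs, S32 accumulators, offsets at zero) and the new SmallGEMMLowpDataset config; the exact shapes and the TensorInfo overload taking a QuantizationInfo are illustrative assumptions, not part of the patch:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // a is a 1-row matrix (i.e. a vector): a->info()->dimension(1) < 2 is what
    // routes the function onto the new vector-matrix multiplication path.
    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(21U, 1U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
    b.allocator()->init(TensorInfo(TensorShape(43U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
    dst.allocator()->init(TensorInfo(TensorShape(43U, 1U), 1, DataType::S32));

    NEGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, &dst);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized data, then:
    gemmlowp.run();
    return 0;
}
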
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 889bbca7f2..cc513ade10 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -99,6 +99,8 @@ private:
Tensor _workspace;
int32_t _a_offset;
int32_t _b_offset;
+ bool _run_vector_matrix_multiplication;
+ bool _dot_product_path;
};
}
#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 208a60ce27..a68a01f6a6 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -42,81 +42,439 @@ using namespace arm_compute;
namespace arm_compute
{
-class Coordinates;
-} // namespace arm_compute
-
namespace
{
-Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(id.x() > width_b)
+ {
+ return;
+ }
- TensorShape in0_shape = input0->tensor_shape();
- TensorShape in1_shape = input1->tensor_shape();
- TensorShape out_shape = output->tensor_shape();
+ // Note: Since the inputs are all positive, we can use uint32_t
+ // Accumulators for the block 0
+ uint32x4x4_t c0 =
+ {
+ {
+ vdupq_n_u32(0),
+ vdupq_n_u32(0),
+ vdupq_n_u32(0),
+ vdupq_n_u32(0)
+ }
+ };
- in0_shape.collapse(2);
- in1_shape.collapse(2);
- out_shape.collapse(2);
+ auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+ // This for loop performs 8 accumulations
+ for(; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const uint8x8_t a00_u8 = vld1_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
+ const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
+ const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
+ const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
+ const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
+ const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
+ const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
+ const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
+
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4x2_t a00_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(a00_u8)),
+ vget_high_u16(vmovl_u8(a00_u8))
+ }
+ };
- return Error{};
-}
+ const uint16x4x4_t b00_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
+ }
+ };
-std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
-{
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ const uint16x4x4_t b10_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
+ }
+ };
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ const uint16x4x4_t b20_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
+ }
+ };
- AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1));
- AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ const uint16x4x4_t b30_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
+ }
+ };
- bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+ const uint16x4x4_t b40_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
+ }
+ };
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ const uint16x4x4_t b50_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
+ }
+ };
- Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
- return std::make_pair(err, win);
-}
-} // namespace
+ const uint16x4x4_t b60_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
+ }
+ };
-NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
-{
-}
+ const uint16x4x4_t b70_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
+ }
+ };
-void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
- TensorShape in1_shape = input1->info()->tensor_shape();
- in1_shape.collapse(2);
+ // This for loop performs the left-over accumulations
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const uint8x8_t a00_u8 = vld1_dup_u8(vec_a);
+ const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _slide_matrix_b = in1_shape[2] != 1;
+ const uint16x4x4_t b00_u16 =
+ {
+ {
+ vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+ vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
+ vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
+ }
+ };
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ // Convert a00_u8 to uint16_t and get the lower part
+ const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+ c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+ c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+ c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
+
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
+ vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
+ vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
+ vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
+ },
+ ina, inb, out);
}
-Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ if(id.x() > width_b)
+ {
+ return;
+ }
- return Error{};
+ // Accumulators for the block 0
+ int32x4x4_t c0 =
+ {
+ {
+ vdupq_n_s32(0),
+ vdupq_n_s32(0),
+ vdupq_n_s32(0),
+ vdupq_n_s32(0)
+ }
+ };
+
+ auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr());
+ auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr());
+ auto vec_a_end_addr = vec_a + width_a;
+
+ // This for loop performs 8 accumulations
+ for(; vec_a <= (vec_a_end_addr - 8);)
+ {
+ const int8x8_t a00_s8 = vld1_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
+ const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
+ const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
+ const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
+ const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
+ const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
+ const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
+ const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
+
+ // Convert a00_s8 to int16_t and get the lower part
+ const int16x4x2_t a00_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(a00_s8)),
+ vget_high_s16(vmovl_s8(a00_s8))
+ }
+ };
+
+ const int16x4x4_t b00_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
+ }
+ };
+
+ const int16x4x4_t b10_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
+ }
+ };
+
+ const int16x4x4_t b20_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
+ }
+ };
+
+ const int16x4x4_t b30_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
+ }
+ };
+
+ const int16x4x4_t b40_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
+ }
+ };
+
+ const int16x4x4_t b50_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
+ }
+ };
+
+ const int16x4x4_t b60_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
+ }
+ };
+
+ const int16x4x4_t b70_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
+ }
+ };
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
+
+ // Accumulate 1:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
+
+ // Accumulate 2:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
+
+ // Accumulate 3:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
+
+ // Accumulate 4:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
+
+ // Accumulate 5:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
+
+ // Accumulate 6:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
+
+ // Accumulate 7:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
+
+ vec_a += 8;
+ matrix_b += 8 * stride_b;
+ }
+
+ // This for loop performs the left-over accumulations
+ for(; vec_a < vec_a_end_addr;)
+ {
+ const int8x8_t a00_s8 = vld1_dup_s8(vec_a);
+ const int8x16_t b00_s8 = vld1q_s8(matrix_b);
+
+ const int16x4x4_t b00_s16 =
+ {
+ {
+ vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+ vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
+ vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
+ }
+ };
+
+ // Convert a00_s8 to int16_t and get the lower part
+ const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+ // Accumulate 0:
+ c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+ c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+ c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+ c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+ vec_a += 1;
+ matrix_b += stride_b;
+ }
+
+ auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+ vst1q_s32(vec_out + 0, c0.val[0]);
+ vst1q_s32(vec_out + 4, c0.val[1]);
+ vst1q_s32(vec_out + 8, c0.val[2]);
+ vst1q_s32(vec_out + 12, c0.val[3]);
+ },
+ ina, inb, out);
}
void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window)
@@ -176,7 +534,7 @@ void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int
const uint8x8_t a00_u8 = vld1_u8(mtx_a0);
const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
- // Convert a00_s8 to uint16_t and get the lower part
+ // Convert a00_u8 to uint16_t and get the lower part
const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
// Convert b00_s8 to uint16_t
@@ -355,55 +713,222 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int
},
ina, inb, out);
}
+} // namespace
-void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
+class Coordinates;
+} // namespace arm_compute
+
+namespace
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ TensorShape in0_shape = input0->tensor_shape();
+ TensorShape in1_shape = input1->tensor_shape();
+ TensorShape out_shape = output->tensor_shape();
+
+ // Check vector-by-matrix case
+ if(out_shape[1] == 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
+ }
+ else
+ {
+ in0_shape.collapse(2);
+ in1_shape.collapse(2);
+ out_shape.collapse(2);
- const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
- const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+ }
+
+ return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win;
+ bool window_changed = false;
- // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if((output->dimension(1) == 1))
+ {
+ // Configure kernel window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+
+ // We cannot read out-of-bound elements from matrix A as we use the left-over for loop
+ AccessWindowStatic in0_access(input0, 0, 0, input0->tensor_shape().x(), 1);
+ AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+
+ window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
- // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(_slide_matrix_b)
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ }
+ else
{
- win_b = window;
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1));
+ AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
}
- win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
- win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
- // The step x and step y for the output matrix has been already set using in configure()
- Iterator ina(_input0, win_a);
- Iterator inb(_input1, win_b);
- Iterator out(_output, window);
+ Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
+
+ TensorShape in1_shape = input1->info()->tensor_shape();
+ in1_shape.collapse(2);
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _slide_matrix_b = in1_shape[2] != 1;
- const int width_b = _input1->info()->dimension(0);
- switch(_input0->info()->data_type())
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
+
+ return Error{};
+}
+
+void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+ if((_output->info()->dimension(1) == 1))
{
- case DataType::S8:
+ const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0));
+ const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0));
+ const auto in_b_stride = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type()));
+
+ // The implementation computes 16 elements per iteration
+ const int window_start_x = 16 * info.thread_id;
+ const int window_step_x = 16 * info.num_threads;
+ // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+ const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
+
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(_input1->info()->num_dimensions() >= 3)
{
- matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
- break;
+ win_b = window;
}
- case DataType::U8:
- case DataType::QASYMM8:
+ win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+ win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator ina(_input0, win_a);
+ Iterator inb(_input1, win_b);
+ Iterator out(_output, win_out);
+
+ switch(_input0->info()->data_type())
{
- matrix_multiply_u8(ina, inb, out, width_b, out_stride, window);
- break;
+ case DataType::S8:
+ {
+ vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);
+ break;
+ }
+ case DataType::U8:
+ case DataType::QASYMM8:
+ {
+ vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
}
- default:
+ }
+ else
+ {
+ const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
+ const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
+
+ // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has 4 times fewer rows than the output matrix
+ Window win_a(window);
+ win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+ win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
+
+ // Set step_x and step_y for matrix B. Scale the X range by a factor of 16 as the transposed input matrix B has 16 times fewer columns than the output matrix
+ Window win_b;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(_slide_matrix_b)
{
- ARM_COMPUTE_ERROR("Not supported");
- break;
+ win_b = window;
+ }
+ win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
+ win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ // The step x and step y for the output matrix have already been set in configure()
+ Iterator ina(_input0, win_a);
+ Iterator inb(_input1, win_b);
+ Iterator out(_output, window);
+
+ const int width_b = _input1->info()->dimension(0);
+ switch(_input0->info()->data_type())
+ {
+ case DataType::S8:
+ {
+ matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
+ break;
+ }
+ case DataType::U8:
+ case DataType::QASYMM8:
+ {
+ matrix_multiply_u8(ina, inb, out, width_b, out_stride, window);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ break;
+ }
}
}
}
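
The new vector_matrix_multiply_{u8,s8} kernels above rely on a widening multiply-accumulate idiom: 8-bit loads are widened to 16 bits with vmovl, then a whole 16-element chunk of one row of B is multiplied by one element of A and accumulated into 32-bit lanes, so no intermediate saturation can occur. A standalone sketch of one such step (an illustrative helper, not code from the patch):

#include <arm_neon.h>
#include <stdint.h>

/* One accumulation step: acc[0..3] += a_scalar * b_row[0..15], with u8 inputs
 * widened to u16 and accumulated into u32, as in the kernel's inner loop. */
static inline void accumulate_step(uint32x4_t acc[4], uint8_t a_scalar, const uint8_t *b_row)
{
    const uint8x16_t b_u8  = vld1q_u8(b_row);             /* 16 elements of one row of B */
    const uint16x8_t b_lo  = vmovl_u8(vget_low_u8(b_u8)); /* widen u8 -> u16 */
    const uint16x8_t b_hi  = vmovl_u8(vget_high_u8(b_u8));
    const uint16x4_t a_u16 = vdup_n_u16(a_scalar);        /* broadcast one element of A */

    /* vmlal_u16 widens the u16 products into the u32 accumulators */
    acc[0] = vmlal_u16(acc[0], vget_low_u16(b_lo),  a_u16);
    acc[1] = vmlal_u16(acc[1], vget_high_u16(b_lo), a_u16);
    acc[2] = vmlal_u16(acc[2], vget_low_u16(b_hi),  a_u16);
    acc[3] = vmlal_u16(acc[3], vget_high_u16(b_hi), a_u16);
}

The kernel itself avoids the per-element broadcast by loading 8 elements of A at once and using vmlal_lane_u16 with lane indices 0-3 on the low and high halves of the widened vector, which is why its main loop unrolls over 8 rows of B per iteration.
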
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index da5ac22fdc..2c6515c1df 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -48,7 +48,7 @@ using namespace arm_compute;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0)
+ _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
{
}
@@ -57,10 +57,9 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info()));
- bool dot_product_path = false;
-
- _a_offset = a->info()->quantization_info().offset;
- _b_offset = b->info()->quantization_info().offset;
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
+ _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
#ifdef ARM_COMPUTE_AARCH64_V8_2
// Check for DOT product instruction
@@ -69,7 +68,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
if(cpu_has_dotprod != 0)
{
- dot_product_path = true;
+ _dot_product_path = true;
// Configure matrix multiply kernel
struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -90,42 +89,54 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
else
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
-
- // Configure interleave kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
- _mtx_a_reshape_kernel = std::move(k);
- }
-
- // Configure transpose kernel
+ if(_run_vector_matrix_multiplication)
{
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
- k->configure(b, &_tmp_b);
- _mtx_b_reshape_kernel = std::move(k);
+ // Configure matrix multiply kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(a, b, output);
+ _mm_kernel = std::move(k);
+ }
}
-
- // Configure matrix multiply kernel
+ else
{
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(&_tmp_a, &_tmp_b, output);
- _mm_kernel = std::move(k);
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
+ // Configure interleave kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+ k->configure(a, &_tmp_a);
+ _mtx_a_reshape_kernel = std::move(k);
+ }
+
+ // Configure transpose kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+ k->configure(b, &_tmp_b);
+ _mtx_b_reshape_kernel = std::move(k);
+ }
+
+ // Configure matrix multiply kernel
+ {
+ auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+ k->configure(&_tmp_a, &_tmp_b, output);
+ _mm_kernel = std::move(k);
+ }
}
}
@@ -166,7 +177,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
// Allocate tensors
- if(!dot_product_path)
+ if(!_dot_product_path && !_run_vector_matrix_multiplication)
{
_tmp_a.allocator()->allocate();
_tmp_b.allocator()->allocate();
@@ -199,8 +210,9 @@ Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
"The output matrix must have the same number of columns as the matrix B");
- int32_t a_offset = a->quantization_info().offset;
- int32_t b_offset = b->quantization_info().offset;
+ int32_t a_offset = a->quantization_info().offset;
+ int32_t b_offset = b->quantization_info().offset;
+ bool run_vector_matrix_multiplication = a->dimension(1) < 2;
#ifdef ARM_COMPUTE_AARCH64_V8_2
// Check for DOT product instruction
@@ -215,22 +227,29 @@ Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensor
else
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+ if(!run_vector_matrix_multiplication)
+ {
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ }
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -271,14 +290,18 @@ void NEGEMMLowpMatrixMultiplyCore::run()
{
_memory_group.acquire();
- if(_mtx_a_reshape_kernel)
+ // Do not run the reshape kernels if we run the vector-by-matrix case or if we use the optimized GEMM with the dot product instruction
+ if(!_run_vector_matrix_multiplication && !_dot_product_path)
{
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
- }
+ if(_mtx_a_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+ }
- if(_mtx_b_reshape_kernel)
- {
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ if(_mtx_b_reshape_kernel)
+ {
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ }
}
NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
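
Since the new benchmarks set both quantization offsets to zero (the offset contribution kernel then adds nothing), the function's output on the vector-matrix path reduces to the plain integer product dst[n] = sum_k a[k] * b[k][n]. A minimal scalar reference for cross-checking, written as a hypothetical helper that follows the same traversal order as the NEON kernel (walk A once, accumulating scaled rows of B):

#include <cstdint>
#include <cstddef>
#include <vector>

// dst[n] = sum_k a[k] * b[k * N + n]; u8 inputs, s32 accumulators.
std::vector<int32_t> vector_matrix_multiply_ref(const std::vector<uint8_t> &a, // K elements
                                                const std::vector<uint8_t> &b, // K x N, row-major
                                                size_t K, size_t N)
{
    std::vector<int32_t> dst(N, 0);
    for(size_t k = 0; k < K; ++k) // outer loop over A, as in the kernel
    {
        for(size_t n = 0; n < N; ++n)
        {
            dst[n] += static_cast<int32_t>(a[k]) * static_cast<int32_t>(b[k * N + n]);
        }
    }
    return dst;
}
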
diff --git a/tests/benchmark/CL/GEMMLowp.cpp b/tests/benchmark/CL/GEMMLowp.cpp
new file mode 100644
index 0000000000..039695b18f
--- /dev/null
+++ b/tests/benchmark/CL/GEMMLowp.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/benchmark/fixtures/GEMMLowpFixture.h"
+#include "tests/datasets/GoogleNetGEMMDataset.h"
+#include "tests/datasets/MatrixMultiplyGEMMDataset.h"
+#include "tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+using CLGEMMLowpFixture = GEMMLowpMatrixMultiplyCoreFixture<CLTensor, CLGEMMLowpMatrixMultiplyCore, CLAccessor>;
+
+TEST_SUITE(CL)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1GEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::ALL, datasets::GoogLeNetInceptionV1GEMMDataset());
+REGISTER_FIXTURE_DATA_TEST_CASE(MatrixMultiplyGEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::ALL, datasets::MatrixMultiplyGEMMDataset());
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogleNetGEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::NIGHTLY, datasets::GoogleNetGEMMDataset());
+
+TEST_SUITE_END()
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/NEON/GEMMLowp.cpp b/tests/benchmark/NEON/GEMMLowp.cpp
index a0e5e694bd..e3ad1d43d2 100644
--- a/tests/benchmark/NEON/GEMMLowp.cpp
+++ b/tests/benchmark/NEON/GEMMLowp.cpp
@@ -23,42 +23,30 @@
*/
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/NEON/Accessor.h"
#include "tests/benchmark/fixtures/GEMMLowpFixture.h"
+#include "tests/datasets/GoogleNetGEMMDataset.h"
+#include "tests/datasets/MatrixMultiplyGEMMDataset.h"
+#include "tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "utils/TypePrinter.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
-#include "tests/NEON/Helper.h"
-
namespace arm_compute
{
namespace test
{
-const auto data_int_blk = framework::dataset::make("M", 800) * framework::dataset::make("N", 800) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9);
+using NEGEMMLowpFixture = GEMMLowpMatrixMultiplyCoreFixture<Tensor, NEGEMMLowpMatrixMultiplyCore, Accessor>;
TEST_SUITE(NEON)
-TEST_SUITE(INTERLEAVE_BLOCKED)
-using NEInterleaveBlocked = NESynthetizeFunction<NEGEMMInterleaveBlockedKernel>;
-using NEGEMMInterleaveBlockedFixture = GEMMInterleaveBlockedFixture<Tensor, NEInterleaveBlocked, Accessor>;
-REGISTER_FIXTURE_DATA_TEST_CASE(InterleaveBlocked, NEGEMMInterleaveBlockedFixture, framework::DatasetMode::ALL, data_int_blk);
-TEST_SUITE_END()
-
-#if 0 //FIXME: enable when we update NEGEMMLowp interface to work without offsets
-TEST_SUITE(U32)
-using NEGEMMLowpFixture = GEMMLowpFixture<Tensor, NEGEMMLowp, Accessor>;
-REGISTER_FIXTURE_DATA_TEST_CASE(GEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, framework::dataset::make("M", 100, 120) * framework::dataset::make("N", 100,
- 110)
- * framework::dataset::make("K", 16, 20));
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1GEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, datasets::GoogLeNetInceptionV1GEMMDataset());
+REGISTER_FIXTURE_DATA_TEST_CASE(MatrixMultiplyGEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, datasets::MatrixMultiplyGEMMDataset());
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogleNetGEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::NIGHTLY, datasets::GoogleNetGEMMDataset());
TEST_SUITE_END()
-#endif // defined(__aarch64__)
-
-TEST_SUITE_END()
-
} // namespace test
} // namespace arm_compute
diff --git a/tests/benchmark/fixtures/GEMMLowpFixture.h b/tests/benchmark/fixtures/GEMMLowpFixture.h
index b640705990..4bd7dfd42f 100644
--- a/tests/benchmark/fixtures/GEMMLowpFixture.h
+++ b/tests/benchmark/fixtures/GEMMLowpFixture.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_TEST_GEMMFIXTURE
-#define ARM_COMPUTE_TEST_GEMMFIXTURE
+#ifndef ARM_COMPUTE_TEST_GEMMLOWPFIXTURE
+#define ARM_COMPUTE_TEST_GEMMLOWPFIXTURE
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
@@ -34,59 +34,26 @@ namespace arm_compute
{
namespace test
{
-template <typename TensorType, typename Function, typename Accessor, bool Transposed = false>
-class GEMMInterleaveBlockedFixture : public framework::Fixture
-{
-public:
- template <typename...>
- void setup(size_t x, size_t y, int int_by, int block)
- {
- const float interleave_by_f32 = int_by;
- const TensorShape shape_a(x, y);
- const TensorShape shape_b(static_cast<size_t>(x * interleave_by_f32), static_cast<size_t>(std::ceil(y / interleave_by_f32)));
- // Create tensors
- a = create_tensor<TensorType>(shape_a, DataType::U8, 1);
- b = create_tensor<TensorType>(shape_b, DataType::U8, 1);
-
- // Create and configure function
- f.configure(&a, &b, int_by, block, Transposed);
-
- // Allocate tensors
- a.allocator()->allocate();
- b.allocator()->allocate();
- }
- void run()
- {
- f.run();
- }
-
- void teardown()
- {
- a.allocator()->free();
- b.allocator()->free();
- }
-
-private:
- TensorType a{};
- TensorType b{};
- Function f{};
-};
-
/** Fixture that can be used for NEON and CL */
template <typename TensorType, typename Function, typename Accessor>
-class GEMMLowpFixture : public framework::Fixture
+class GEMMLowpMatrixMultiplyCoreFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(size_t m, size_t n, size_t k)
+ void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape shape_dst, float alpha, float beta)
{
- const TensorShape shape_a(k, m);
- const TensorShape shape_b(n, k);
- const TensorShape shape_c(n, m);
+ // TODO (COMPMID-717): The interface used for GEMMLowp is the same one used for GEMM in order to re-use the datasets
+ // However the interface for both GEMM and GEMMLowp should be reworked in order to accept only the 3 dimensions M, N and K
+ ARM_COMPUTE_UNUSED(shape_c);
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(beta);
+
+ // Note: The offsets for matrix A and matrix B are set to 0 in order to skip the computation for the offset contribution
+
// Create tensors
- a = create_tensor<TensorType>(shape_a, DataType::U8, 1);
- b = create_tensor<TensorType>(shape_b, DataType::U8, 1);
- c = create_tensor<TensorType>(shape_c, DataType::U32, 1);
+ a = create_tensor<TensorType>(shape_a, DataType::QASYMM8, 1, 0, QuantizationInfo(1.0f / 255.0f, 0));
+ b = create_tensor<TensorType>(shape_b, DataType::QASYMM8, 1, 0, QuantizationInfo(1.0f / 255.0f, 0));
+ c = create_tensor<TensorType>(shape_dst, DataType::S32, 1, 0, QuantizationInfo(1.0f / 255.0f, 0));
// Create and configure function
gemmlowp.configure(&a, &b, &c);
@@ -99,7 +66,6 @@ public:
// Fill tensors
library->fill_tensor_uniform(Accessor(a), 0);
library->fill_tensor_uniform(Accessor(b), 1);
- library->fill_tensor_uniform(Accessor(c), 2);
}
void run()
{
@@ -122,4 +88,4 @@ private:
} // namespace test
} // namespace arm_compute
-#endif /* ARM_COMPUTE_TEST_GEMMFIXTURE */
+#endif /* ARM_COMPUTE_TEST_GEMMLOWPFIXTURE */
diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h
index 87f879e70a..5c0230e262 100644
--- a/tests/datasets/LargeGEMMLowpDataset.h
+++ b/tests/datasets/LargeGEMMLowpDataset.h
@@ -42,7 +42,7 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset
public:
LargeGEMMLowpDataset()
{
- add_config(TensorShape(923U, 2U), TensorShape(871U, 923U), TensorShape(871U, 2U), 0, 0);
+ add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), 0, 0);
add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0);
add_config(TensorShape(873U, 7U), TensorShape(784U, 873U), TensorShape(784U, 7U), -1, 3);
add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4);
diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h
index 1d4ab53be5..b6651bdb42 100644
--- a/tests/datasets/SmallGEMMLowpDataset.h
+++ b/tests/datasets/SmallGEMMLowpDataset.h
@@ -42,7 +42,7 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset
public:
SmallGEMMLowpDataset()
{
- add_config(TensorShape(21U, 2U), TensorShape(43U, 21U), TensorShape(43U, 2U), 0, 0);
+ add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U), 0, 0);
add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0);
add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13);
add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4);