From c7f9b893b8edc5660542821e2d0508460bc40225 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Thu, 30 Nov 2017 14:31:13 +0000 Subject: COMPMID-722 - Support for vector-matrix in GEMMLowp (NEON) This patch includes COMPMID-716 as well - Added vector-matrix case in NEGEMMLowpMatrixMultiplyKernel - Added benchmarks for NEON and OpenCL Change-Id: I715cd25e8668a4d6c8127e9a298a865e7713267f Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111468 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Georgios Pinitas --- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 2 + .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 701 ++++++++++++++++++--- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 151 +++-- tests/benchmark/CL/GEMMLowp.cpp | 52 ++ tests/benchmark/NEON/GEMMLowp.cpp | 28 +- tests/benchmark/fixtures/GEMMLowpFixture.h | 66 +- tests/datasets/LargeGEMMLowpDataset.h | 2 +- tests/datasets/SmallGEMMLowpDataset.h | 2 +- 8 files changed, 780 insertions(+), 224 deletions(-) create mode 100644 tests/benchmark/CL/GEMMLowp.cpp diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index 889bbca7f2..cc513ade10 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -99,6 +99,8 @@ private: Tensor _workspace; int32_t _a_offset; int32_t _b_offset; + bool _run_vector_matrix_multiplication; + bool _dot_product_path; }; } #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 208a60ce27..a68a01f6a6 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -42,81 +42,439 @@ using namespace arm_compute; namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { -Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + execute_window_loop(window, [&](const Coordinates & id) + { + if(id.x() > width_b) + { + return; + } - TensorShape in0_shape = input0->tensor_shape(); - TensorShape in1_shape = input1->tensor_shape(); - TensorShape out_shape = output->tensor_shape(); + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = + { + { + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) + } + }; - in0_shape.collapse(2); - in1_shape.collapse(2); - out_shape.collapse(2); + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the 
number of batches must be set to 1"); + // This for loop performs 8 accumulations + for(; vec_a <= (vec_a_end_addr - 8);) + { + const uint8x8_t a00_u8 = vld1_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); + const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); + const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); + const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); + const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); + const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); + const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); + const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4x2_t a00_u16 = + { + { + vget_low_u16(vmovl_u8(a00_u8)), + vget_high_u16(vmovl_u8(a00_u8)) + } + }; - return Error{}; -} + const uint16x4x4_t b00_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) + } + }; -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; + const uint16x4x4_t b10_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) + } + }; - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + const uint16x4x4_t b20_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) + } + }; - AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1)); - AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + const uint16x4x4_t b30_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) + } + }; - bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); + const uint16x4x4_t b40_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) + } + }; - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + const uint16x4x4_t b50_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) + } + }; - Error err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; - return std::make_pair(err, win); -} -} // namespace + const uint16x4x4_t b60_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) + } + }; -NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) -{ -} + const uint16x4x4_t b70_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) + } + }; -void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], 
a00_u16.val[1], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - TensorShape in1_shape = input1->info()->tensor_shape(); - in1_shape.collapse(2); + // This for loop performs the left-over accumulations + for(; vec_a < vec_a_end_addr;) + { + const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b); - _input0 = input0; - _input1 = input1; - _output = output; - _slide_matrix_b = in1_shape[2] != 1; + const uint16x4x4_t b00_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) + } + }; - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast(out.ptr()); + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + }, + ina, inb, out); } -Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); + execute_window_loop(window, [&](const Coordinates & id) + { + if(id.x() > width_b) + { + return; + } - return Error{}; + // Accumulators for the block 0 + int32x4x4_t c0 = + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0) + } + }; + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for(; vec_a <= (vec_a_end_addr - 8);) + { + const int8x8_t a00_s8 = vld1_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); + const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); + const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); + const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); + const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); + const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); + const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); + const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); + + // Convert a00_s8 to int16_t and get the lower part + const int16x4x2_t a00_s16 = + { + { + vget_low_s16(vmovl_s8(a00_s8)), + vget_high_s16(vmovl_s8(a00_s8)) + } + }; + + const int16x4x4_t b00_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), + 
vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + } + }; + + const int16x4x4_t b10_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) + } + }; + + const int16x4x4_t b20_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) + } + }; + + const int16x4x4_t b30_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) + } + }; + + const int16x4x4_t b40_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) + } + }; + + const int16x4x4_t b50_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) + } + }; + + const int16x4x4_t b60_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) + } + }; + + const int16x4x4_t b70_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) + } + }; + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); + 
c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } + + // This for loop performs the left-over accumulations + for(; vec_a < vec_a_end_addr;) + { + const int8x8_t a00_s8 = vld1_dup_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b); + + const int16x4x4_t b00_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + } + }; + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast(out.ptr()); + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + }, + ina, inb, out); } void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window) @@ -176,7 +534,7 @@ void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int const uint8x8_t a00_u8 = vld1_u8(mtx_a0); const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); - // Convert a00_s8 to uint16_t and get the lower part + // Convert a00_u8 to uint16_t and get the lower part const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); // Convert b00_s8 to uint16_t @@ -355,55 +713,222 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int }, ina, inb, out); } +} // namespace -void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info) +class Coordinates; +} // namespace arm_compute + +namespace { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); +Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + TensorShape in0_shape = input0->tensor_shape(); + TensorShape in1_shape = input1->tensor_shape(); + TensorShape out_shape = output->tensor_shape(); + + // Check vector-by-matrix case + 
if(out_shape[1] == 1) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); + } + else + { + in0_shape.collapse(2); + in1_shape.collapse(2); + out_shape.collapse(2); - const size_t in_b_stride = _input1->info()->strides_in_bytes()[1]; - const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + } + + return Error{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win; + bool window_changed = false; - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication + if((output->dimension(1) == 1)) + { + // Configure kernel window + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x)); + + // We cannot read out-of-bound elements from matrix A as we use the left-over for loop + AccessWindowStatic in0_access(input0, 0, 0, input0->tensor_shape().x(), 1); + AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x); + + window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); - // Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(_slide_matrix_b) + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); + } + else { - win_b = window; + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1)); + AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); } - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - // The step x and step y for the output matrix has been already set using in configure() - Iterator ina(_input0, win_a); - Iterator inb(_input1, win_b); - Iterator out(_output, window); + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + +NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) +{ +} + +void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); + + TensorShape in1_shape = input1->info()->tensor_shape(); + in1_shape.collapse(2); + + _input0 = input0; + _input1 = input1; + _output = output; + _slide_matrix_b = in1_shape[2] != 1; - const int width_b = _input1->info()->dimension(0); - switch(_input0->info()->data_type()) + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); + + return Error{}; +} + +void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication path + if((_output->info()->dimension(1) == 1)) { - case DataType::S8: + const auto width_matrix_a = static_cast(_input0->info()->dimension(0)); + const auto width_matrix_b = static_cast(_input1->info()->dimension(0)); + const auto in_b_stride = static_cast(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type())); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_input1->info()->num_dimensions() >= 3) { - matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); - break; + win_b = window; } - case DataType::U8: - case DataType::QASYMM8: + win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(_input0, win_a); + Iterator inb(_input1, win_b); + Iterator out(_output, win_out); + + switch(_input0->info()->data_type()) { - matrix_multiply_u8(ina, inb, out, width_b, out_stride, window); - break; + case DataType::S8: + { + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window); + break; + } + case DataType::U8: + case DataType::QASYMM8: + { + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } } - default: + } + else + { + const size_t in_b_stride = _input1->info()->strides_in_bytes()[1]; + const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size(); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); + + // Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_slide_matrix_b) { - ARM_COMPUTE_ERROR("Not supported"); - break; + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // The step x and step y for the output matrix has been already set using in configure() + Iterator ina(_input0, win_a); + Iterator inb(_input1, win_b); + Iterator out(_output, window); + + const int width_b = _input1->info()->dimension(0); + switch(_input0->info()->data_type()) + { + case DataType::S8: + { + matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); + break; + } + case DataType::U8: + case DataType::QASYMM8: + { + matrix_multiply_u8(ina, inb, out, width_b, out_stride, window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } } } } diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index da5ac22fdc..2c6515c1df 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -48,7 +48,7 @@ using namespace arm_compute; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0) + _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false) { } @@ -57,10 +57,9 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info())); - bool dot_product_path = false; - - _a_offset = a->info()->quantization_info().offset; - _b_offset = b->info()->quantization_info().offset; + _a_offset = a->info()->quantization_info().offset; + _b_offset = b->info()->quantization_info().offset; + _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; #ifdef ARM_COMPUTE_AARCH64_V8_2 // Check for DOT product instruction @@ -69,7 +68,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(cpu_has_dotprod != 0) { - dot_product_path = true; + _dot_product_path = true; // Configure matrix multiply kernel struct CPUInfo ci = NEScheduler::get().cpu_info(); @@ -90,42 +89,54 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, else #endif /* ARM_COMPUTE_AARCH64_V8_2 */ { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ 
b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(a, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel + if(_run_vector_matrix_multiplication) { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); + // Configure matrix multiply kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(a, b, output); + _mm_kernel = std::move(k); + } } - - // Configure matrix multiply kernel + else { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(&_tmp_a, &_tmp_b, output); - _mm_kernel = std::move(k); + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->info()->tensor_shape(); + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->info()->tensor_shape(); + shape_tmp_b.set(0, b->info()->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); + _tmp_a.allocator()->init(info_a); + _tmp_b.allocator()->init(info_b); + _memory_group.manage(&_tmp_a); + _memory_group.manage(&_tmp_b); + + // Configure interleave kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(a, &_tmp_a); + _mtx_a_reshape_kernel = std::move(k); + } + + // Configure transpose kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(b, &_tmp_b); + _mtx_b_reshape_kernel = std::move(k); + } + + // Configure matrix multiply kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(&_tmp_a, &_tmp_b, output); + _mm_kernel = std::move(k); + } } } @@ -166,7 +177,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); // Allocate tensors - if(!dot_product_path) + if(!_dot_product_path && !_run_vector_matrix_multiplication) { _tmp_a.allocator()->allocate(); _tmp_b.allocator()->allocate(); @@ -199,8 +210,9 @@ Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - int32_t a_offset = a->quantization_info().offset; - int32_t b_offset = b->quantization_info().offset; + int32_t a_offset = a->quantization_info().offset; + int32_t b_offset = b->quantization_info().offset; + bool run_vector_matrix_multiplication = a->dimension(1) < 2; #ifdef ARM_COMPUTE_AARCH64_V8_2 // Check for DOT product instruction @@ -215,22 +227,29 @@ Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensor else #endif /* ARM_COMPUTE_AARCH64_V8_2 */ { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); + if(!run_vector_matrix_multiplication) + { + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->tensor_shape(); + shape_tmp_a.set(0, a->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->tensor_shape(); + shape_tmp_b.set(0, b->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output)); + } } TensorInfo info_vector_sum_col, info_vector_sum_row; @@ -271,14 +290,18 @@ void NEGEMMLowpMatrixMultiplyCore::run() { _memory_group.acquire(); - if(_mtx_a_reshape_kernel) + // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction + if(!_run_vector_matrix_multiplication && !_dot_product_path) { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } + if(_mtx_a_reshape_kernel) + { + NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); + } - if(_mtx_b_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + 
if(_mtx_b_reshape_kernel) + { + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + } } NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); diff --git a/tests/benchmark/CL/GEMMLowp.cpp b/tests/benchmark/CL/GEMMLowp.cpp new file mode 100644 index 0000000000..039695b18f --- /dev/null +++ b/tests/benchmark/CL/GEMMLowp.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "tests/CL/CLAccessor.h" +#include "tests/benchmark/fixtures/GEMMLowpFixture.h" +#include "tests/datasets/GoogleNetGEMMDataset.h" +#include "tests/datasets/MatrixMultiplyGEMMDataset.h" +#include "tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace test +{ +using CLGEMMLowpFixture = GEMMLowpMatrixMultiplyCoreFixture; + +TEST_SUITE(CL) + +REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1GEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::ALL, datasets::GoogLeNetInceptionV1GEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(MatrixMultiplyGEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::ALL, datasets::MatrixMultiplyGEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(GoogleNetGEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::NIGHTLY, datasets::GoogleNetGEMMDataset()); + +TEST_SUITE_END() +} // namespace test +} // namespace arm_compute diff --git a/tests/benchmark/NEON/GEMMLowp.cpp b/tests/benchmark/NEON/GEMMLowp.cpp index a0e5e694bd..e3ad1d43d2 100644 --- a/tests/benchmark/NEON/GEMMLowp.cpp +++ b/tests/benchmark/NEON/GEMMLowp.cpp @@ -23,42 +23,30 @@ */ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" #include "tests/benchmark/fixtures/GEMMLowpFixture.h" +#include "tests/datasets/GoogleNetGEMMDataset.h" +#include "tests/datasets/MatrixMultiplyGEMMDataset.h" +#include 
"tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" #include "utils/TypePrinter.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h" -#include "tests/NEON/Helper.h" - namespace arm_compute { namespace test { -const auto data_int_blk = framework::dataset::make("M", 800) * framework::dataset::make("N", 800) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9); +using NEGEMMLowpFixture = GEMMLowpMatrixMultiplyCoreFixture; TEST_SUITE(NEON) -TEST_SUITE(INTERLEAVE_BLOCKED) -using NEInterleaveBlocked = NESynthetizeFunction; -using NEGEMMInterleaveBlockedFixture = GEMMInterleaveBlockedFixture; -REGISTER_FIXTURE_DATA_TEST_CASE(InterleaveBlocked, NEGEMMInterleaveBlockedFixture, framework::DatasetMode::ALL, data_int_blk); -TEST_SUITE_END() - -#if 0 //FIXME: enable when we update NEGEMMLowp interface to work without offsets -TEST_SUITE(U32) -using NEGEMMLowpFixture = GEMMLowpFixture; -REGISTER_FIXTURE_DATA_TEST_CASE(GEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, framework::dataset::make("M", 100, 120) * framework::dataset::make("N", 100, - 110) - * framework::dataset::make("K", 16, 20)); +REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1GEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, datasets::GoogLeNetInceptionV1GEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(MatrixMultiplyGEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, datasets::MatrixMultiplyGEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(GoogleNetGEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::NIGHTLY, datasets::GoogleNetGEMMDataset()); TEST_SUITE_END() -#endif // defined(__aarch64__) - -TEST_SUITE_END() - } // namespace test } // namespace arm_compute diff --git a/tests/benchmark/fixtures/GEMMLowpFixture.h b/tests/benchmark/fixtures/GEMMLowpFixture.h index b640705990..4bd7dfd42f 100644 --- a/tests/benchmark/fixtures/GEMMLowpFixture.h +++ b/tests/benchmark/fixtures/GEMMLowpFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_GEMMFIXTURE -#define ARM_COMPUTE_TEST_GEMMFIXTURE +#ifndef ARM_COMPUTE_TEST_GEMMLOWPFIXTURE +#define ARM_COMPUTE_TEST_GEMMLOWPFIXTURE #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -34,59 +34,26 @@ namespace arm_compute { namespace test { -template -class GEMMInterleaveBlockedFixture : public framework::Fixture -{ -public: - template - void setup(size_t x, size_t y, int int_by, int block) - { - const float interleave_by_f32 = int_by; - const TensorShape shape_a(x, y); - const TensorShape shape_b(static_cast(x * interleave_by_f32), static_cast(std::ceil(y / interleave_by_f32))); - // Create tensors - a = create_tensor(shape_a, DataType::U8, 1); - b = create_tensor(shape_b, DataType::U8, 1); - - // Create and configure function - f.configure(&a, &b, int_by, block, Transposed); - - // Allocate tensors - a.allocator()->allocate(); - b.allocator()->allocate(); - } - void run() - { - f.run(); - } - - void teardown() - { - a.allocator()->free(); - b.allocator()->free(); - } - -private: - TensorType a{}; - TensorType b{}; - Function f{}; -}; - /** Fixture that can be used for NEON and CL */ template -class GEMMLowpFixture : public framework::Fixture +class GEMMLowpMatrixMultiplyCoreFixture : public framework::Fixture { public: template - void setup(size_t m, size_t n, size_t k) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape shape_dst, float alpha, float beta) { - const TensorShape shape_a(k, m); - const TensorShape shape_b(n, k); - const TensorShape shape_c(n, m); + // TODO (COMPMID-717): The interface used for GEMMLowp is the same one used for GEMM in order to re-use the datasets + // However the interface for both GEMM and GEMMLowp should be reworked in order to accepts only the 3 dimensions M, N and K + ARM_COMPUTE_UNUSED(shape_c); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + + // Note: The offsets for matrix A and matrix B are set to 0 in order to skip the computation for the offset contribution + // Create tensors - a = create_tensor(shape_a, DataType::U8, 1); - b = create_tensor(shape_b, DataType::U8, 1); - c = create_tensor(shape_c, DataType::U32, 1); + a = create_tensor(shape_a, DataType::QASYMM8, 1, 0, QuantizationInfo(1.0f / 255.0f, 0)); + b = create_tensor(shape_b, DataType::QASYMM8, 1, 0, QuantizationInfo(1.0f / 255.0f, 0)); + c = create_tensor(shape_dst, DataType::S32, 1, 0, QuantizationInfo(1.0f / 255.0f, 0)); // Create and configure function gemmlowp.configure(&a, &b, &c); @@ -99,7 +66,6 @@ public: // Fill tensors library->fill_tensor_uniform(Accessor(a), 0); library->fill_tensor_uniform(Accessor(b), 1); - library->fill_tensor_uniform(Accessor(c), 2); } void run() { @@ -122,4 +88,4 @@ private: } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_GEMMFIXTURE */ +#endif /* ARM_COMPUTE_TEST_GEMMLOWPFIXTURE */ diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h index 87f879e70a..5c0230e262 100644 --- a/tests/datasets/LargeGEMMLowpDataset.h +++ b/tests/datasets/LargeGEMMLowpDataset.h @@ -42,7 +42,7 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset public: LargeGEMMLowpDataset() { - add_config(TensorShape(923U, 2U), TensorShape(871U, 923U), TensorShape(871U, 2U), 0, 0); + add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), 0, 0); add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0); add_config(TensorShape(873U, 7U), TensorShape(784U, 
873U), TensorShape(784U, 7U), -1, 3); add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4); diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h index 1d4ab53be5..b6651bdb42 100644 --- a/tests/datasets/SmallGEMMLowpDataset.h +++ b/tests/datasets/SmallGEMMLowpDataset.h @@ -42,7 +42,7 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset public: SmallGEMMLowpDataset() { - add_config(TensorShape(21U, 2U), TensorShape(43U, 21U), TensorShape(43U, 2U), 0, 0); + add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U), 0, 0); add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0); add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13); add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4); -- cgit v1.2.1
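
The NEON intrinsics in the new vector-matrix kernels above are dense, so a minimal scalar sketch of what that path computes may help: each output element is the widened dot product of the single row of A with one column of B, accumulated in 32 bits, and the a_offset/b_offset contributions are added afterwards by the separate offset-contribution kernel, just as in the matrix-matrix path. The function and variable names below are illustrative only; they are not part of the patch or the library.

// Illustrative scalar reference (not library code): an unsigned 8-bit
// vector (1 x width_a) times a matrix (width_a x width_b), accumulated
// into int32, mirroring what the vector path of
// NEGEMMLowpMatrixMultiplyKernel produces before the offset contribution.
#include <cstddef>
#include <cstdint>

void vector_matrix_multiply_u8_ref(const uint8_t *vec_a, const uint8_t *matrix_b,
                                   int32_t *vec_out, size_t width_a, size_t width_b,
                                   size_t stride_b /* row stride of B, in elements */)
{
    for(size_t x = 0; x < width_b; ++x)
    {
        // Both operands are non-negative, so a uint32_t accumulator is safe,
        // mirroring the uint32x4x4_t accumulators used in the NEON kernel.
        uint32_t acc = 0;
        for(size_t k = 0; k < width_a; ++k)
        {
            acc += static_cast<uint32_t>(vec_a[k]) * static_cast<uint32_t>(matrix_b[k * stride_b + x]);
        }
        vec_out[x] = static_cast<int32_t>(acc);
    }
}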
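
For the vector path, run() splits the output columns across threads itself rather than relying on the scheduler's window split: each thread starts at 16 * thread_id, steps by 16 * num_threads, and rounds its end up with ceil_to_multiple so the per-thread stripes interleave cleanly. A small worked example with assumed numbers (width_matrix_b = 100, two threads):

// Hypothetical numbers, only to illustrate the split performed in
// NEGEMMLowpMatrixMultiplyKernel::run() for the vector-matrix case.
// Thread 0: start = 0,  step = 32, end = ceil_to_multiple(100 - 0, 32) + 0  = 128 -> x = 0, 32, 64, 96
// Thread 1: start = 16, step = 32, end = ceil_to_multiple(100 - 16, 32) + 16 = 112 -> x = 16, 48, 80
// Together the two stripes visit every 16-wide block of columns (0, 16, ..., 96),
// and the "if(id.x() > width_b) return;" guard inside the kernel skips any block
// that starts past the width of matrix B.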
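
Finally, a hedged end-to-end sketch modelled on the GEMMLowpMatrixMultiplyCoreFixture added by this patch: a single-row matrix A is what selects the new vector-matrix kernel inside NEGEMMLowpMatrixMultiplyCore (unless the AArch64 dot-product assembly path is available, which takes precedence). The shapes are taken from the updated LargeGEMMLowpDataset entry; exact TensorInfo constructor signatures may vary slightly between library versions, so treat this as a sketch rather than a verified build.

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"

using namespace arm_compute;

int main()
{
    // A is 1 x 923 (a vector), B is 923 x 871, the int32 result is 1 x 871.
    // Shapes follow the library's (width, height) convention used in the datasets above.
    // Offsets are set to 0, as in the benchmark fixture, to skip the offset contribution.
    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(923U, 1U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
    b.allocator()->init(TensorInfo(TensorShape(871U, 923U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
    dst.allocator()->init(TensorInfo(TensorShape(871U, 1U), 1, DataType::S32));

    NEGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, &dst); // a has a single row, so the vector-matrix path is chosen

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized data ...

    gemmlowp.run();
    return 0;
}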