From c7f9b893b8edc5660542821e2d0508460bc40225 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Thu, 30 Nov 2017 14:31:13 +0000 Subject: COMPMID-722 - Support for vector-matrix in GEMMLowp (NEON) This patch includes COMPMID-716 as well - Added vector-matrix case in NEGEMMLowpMatrixMultiplyKernel - Added benchmarks for NEON and OpenCL Change-Id: I715cd25e8668a4d6c8127e9a298a865e7713267f Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111468 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Georgios Pinitas --- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 2 + .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 701 ++++++++++++++++++--- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 151 +++-- tests/benchmark/CL/GEMMLowp.cpp | 52 ++ tests/benchmark/NEON/GEMMLowp.cpp | 28 +- tests/benchmark/fixtures/GEMMLowpFixture.h | 66 +- tests/datasets/LargeGEMMLowpDataset.h | 2 +- tests/datasets/SmallGEMMLowpDataset.h | 2 +- 8 files changed, 780 insertions(+), 224 deletions(-) create mode 100644 tests/benchmark/CL/GEMMLowp.cpp diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index 889bbca7f2..cc513ade10 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -99,6 +99,8 @@ private: Tensor _workspace; int32_t _a_offset; int32_t _b_offset; + bool _run_vector_matrix_multiplication; + bool _dot_product_path; }; } #endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp index 208a60ce27..a68a01f6a6 100644 --- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -42,81 +42,439 @@ using namespace arm_compute; namespace arm_compute { -class Coordinates; -} // namespace arm_compute - namespace { -Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + execute_window_loop(window, [&](const Coordinates & id) + { + if(id.x() > width_b) + { + return; + } - TensorShape in0_shape = input0->tensor_shape(); - TensorShape in1_shape = input1->tensor_shape(); - TensorShape out_shape = output->tensor_shape(); + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = + { + { + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) + } + }; - in0_shape.collapse(2); - in1_shape.collapse(2); - out_shape.collapse(2); + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the 
number of batches must be set to 1"); + // This for loop performs 8 accumulations + for(; vec_a <= (vec_a_end_addr - 8);) + { + const uint8x8_t a00_u8 = vld1_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); + const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); + const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); + const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); + const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); + const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); + const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); + const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4x2_t a00_u16 = + { + { + vget_low_u16(vmovl_u8(a00_u8)), + vget_high_u16(vmovl_u8(a00_u8)) + } + }; - return Error{}; -} + const uint16x4x4_t b00_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) + } + }; -std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) -{ - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; + const uint16x4x4_t b10_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) + } + }; - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + const uint16x4x4_t b20_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) + } + }; - AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1)); - AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); - AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + const uint16x4x4_t b30_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) + } + }; - bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); + const uint16x4x4_t b40_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) + } + }; - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + const uint16x4x4_t b50_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) + } + }; - Error err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; - return std::make_pair(err, win); -} -} // namespace + const uint16x4x4_t b60_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) + } + }; -NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) -{ -} + const uint16x4x4_t b70_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) + } + }; -void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], 
a00_u16.val[1], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - TensorShape in1_shape = input1->info()->tensor_shape(); - in1_shape.collapse(2); + // This for loop performs the left-over accumulations + for(; vec_a < vec_a_end_addr;) + { + const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b); - _input0 = input0; - _input1 = input1; - _output = output; - _slide_matrix_b = in1_shape[2] != 1; + const uint16x4x4_t b00_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) + } + }; - // Configure kernel window - auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - INEKernel::configure(win_config.second); + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast(out.ptr()); + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + }, + ina, inb, out); } -Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); + execute_window_loop(window, [&](const Coordinates & id) + { + if(id.x() > width_b) + { + return; + } - return Error{}; + // Accumulators for the block 0 + int32x4x4_t c0 = + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0) + } + }; + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for(; vec_a <= (vec_a_end_addr - 8);) + { + const int8x8_t a00_s8 = vld1_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); + const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); + const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); + const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); + const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); + const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); + const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); + const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); + + // Convert a00_s8 to int16_t and get the lower part + const int16x4x2_t a00_s16 = + { + { + vget_low_s16(vmovl_s8(a00_s8)), + vget_high_s16(vmovl_s8(a00_s8)) + } + }; + + const int16x4x4_t b00_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), + 
vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + } + }; + + const int16x4x4_t b10_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) + } + }; + + const int16x4x4_t b20_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) + } + }; + + const int16x4x4_t b30_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) + } + }; + + const int16x4x4_t b40_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) + } + }; + + const int16x4x4_t b50_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) + } + }; + + const int16x4x4_t b60_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) + } + }; + + const int16x4x4_t b70_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) + } + }; + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); + 
c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } + + // This for loop performs the left-over accumulations + for(; vec_a < vec_a_end_addr;) + { + const int8x8_t a00_s8 = vld1_dup_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b); + + const int16x4x4_t b00_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + } + }; + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast(out.ptr()); + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + }, + ina, inb, out); } void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window) @@ -176,7 +534,7 @@ void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int const uint8x8_t a00_u8 = vld1_u8(mtx_a0); const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); - // Convert a00_s8 to uint16_t and get the lower part + // Convert a00_u8 to uint16_t and get the lower part const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); // Convert b00_s8 to uint16_t @@ -355,55 +713,222 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int }, ina, inb, out); } +} // namespace -void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info) +class Coordinates; +} // namespace arm_compute + +namespace { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); +Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + + TensorShape in0_shape = input0->tensor_shape(); + TensorShape in1_shape = input1->tensor_shape(); + TensorShape out_shape = output->tensor_shape(); + + // Check vector-by-matrix case + 
if(out_shape[1] == 1) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); + } + else + { + in0_shape.collapse(2); + in1_shape.collapse(2); + out_shape.collapse(2); - const size_t in_b_stride = _input1->info()->strides_in_bytes()[1]; - const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + } + + return Error{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) +{ + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win; + bool window_changed = false; - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication + if((output->dimension(1) == 1)) + { + // Configure kernel window + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x)); + + // We cannot read out-of-bound elements from matrix A as we use the left-over for loop + AccessWindowStatic in0_access(input0, 0, 0, input0->tensor_shape().x(), 1); + AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x); + + window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); - // Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(_slide_matrix_b) + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); + } + else { - win_b = window; + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1)); + AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + window_changed = update_window_and_padding(win, in0_access, in1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); } - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - // The step x and step y for the output matrix has been already set using in configure() - Iterator ina(_input0, win_a); - Iterator inb(_input1, win_b); - Iterator out(_output, window); + Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + +NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) +{ +} + +void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); + + TensorShape in1_shape = input1->info()->tensor_shape(); + in1_shape.collapse(2); + + _input0 = input0; + _input1 = input1; + _output = output; + _slide_matrix_b = in1_shape[2] != 1; - const int width_b = _input1->info()->dimension(0); - switch(_input0->info()->data_type()) + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); + + return Error{}; +} + +void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication path + if((_output->info()->dimension(1) == 1)) { - case DataType::S8: + const auto width_matrix_a = static_cast(_input0->info()->dimension(0)); + const auto width_matrix_b = static_cast(_input1->info()->dimension(0)); + const auto in_b_stride = static_cast(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type())); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_input1->info()->num_dimensions() >= 3) { - matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); - break; + win_b = window; } - case DataType::U8: - case DataType::QASYMM8: + win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(_input0, win_a); + Iterator inb(_input1, win_b); + Iterator out(_output, win_out); + + switch(_input0->info()->data_type()) { - matrix_multiply_u8(ina, inb, out, width_b, out_stride, window); - break; + case DataType::S8: + { + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window); + break; + } + case DataType::U8: + case DataType::QASYMM8: + { + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } } - default: + } + else + { + const size_t in_b_stride = _input1->info()->strides_in_bytes()[1]; + const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size(); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); + + // Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_slide_matrix_b) { - ARM_COMPUTE_ERROR("Not supported"); - break; + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // The step x and step y for the output matrix has been already set using in configure() + Iterator ina(_input0, win_a); + Iterator inb(_input1, win_b); + Iterator out(_output, window); + + const int width_b = _input1->info()->dimension(0); + switch(_input0->info()->data_type()) + { + case DataType::S8: + { + matrix_multiply_s8(ina, inb, out, width_b, out_stride, window); + break; + } + case DataType::U8: + case DataType::QASYMM8: + { + matrix_multiply_u8(ina, inb, out, width_b, out_stride, window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } } } } diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index da5ac22fdc..2c6515c1df 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -48,7 +48,7 @@ using namespace arm_compute; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0) + _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false) { } @@ -57,10 +57,9 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info())); - bool dot_product_path = false; - - _a_offset = a->info()->quantization_info().offset; - _b_offset = b->info()->quantization_info().offset; + _a_offset = a->info()->quantization_info().offset; + _b_offset = b->info()->quantization_info().offset; + _run_vector_matrix_multiplication = a->info()->dimension(1) < 2; #ifdef ARM_COMPUTE_AARCH64_V8_2 // Check for DOT product instruction @@ -69,7 +68,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, if(cpu_has_dotprod != 0) { - dot_product_path = true; + _dot_product_path = true; // Configure matrix multiply kernel struct CPUInfo ci = NEScheduler::get().cpu_info(); @@ -90,42 +89,54 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, else #endif /* ARM_COMPUTE_AARCH64_V8_2 */ { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->info()->tensor_shape(); - shape_tmp_a.set(0, a->info()->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ 
b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->info()->tensor_shape(); - shape_tmp_b.set(0, b->info()->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); - _tmp_a.allocator()->init(info_a); - _tmp_b.allocator()->init(info_b); - _memory_group.manage(&_tmp_a); - _memory_group.manage(&_tmp_b); - - // Configure interleave kernel - { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(a, &_tmp_a); - _mtx_a_reshape_kernel = std::move(k); - } - - // Configure transpose kernel + if(_run_vector_matrix_multiplication) { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(b, &_tmp_b); - _mtx_b_reshape_kernel = std::move(k); + // Configure matrix multiply kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(a, b, output); + _mm_kernel = std::move(k); + } } - - // Configure matrix multiply kernel + else { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(&_tmp_a, &_tmp_b, output); - _mm_kernel = std::move(k); + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->info()->tensor_shape(); + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->info()->tensor_shape(); + shape_tmp_b.set(0, b->info()->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type()); + _tmp_a.allocator()->init(info_a); + _tmp_b.allocator()->init(info_b); + _memory_group.manage(&_tmp_a); + _memory_group.manage(&_tmp_b); + + // Configure interleave kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(a, &_tmp_a); + _mtx_a_reshape_kernel = std::move(k); + } + + // Configure transpose kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(b, &_tmp_b); + _mtx_b_reshape_kernel = std::move(k); + } + + // Configure matrix multiply kernel + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(&_tmp_a, &_tmp_b, output); + _mm_kernel = std::move(k); + } } } @@ -166,7 +177,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? 
nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset); // Allocate tensors - if(!dot_product_path) + if(!_dot_product_path && !_run_vector_matrix_multiplication) { _tmp_a.allocator()->allocate(); _tmp_b.allocator()->allocate(); @@ -199,8 +210,9 @@ Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0), "The output matrix must have the same number of columns as the matrix B"); - int32_t a_offset = a->quantization_info().offset; - int32_t b_offset = b->quantization_info().offset; + int32_t a_offset = a->quantization_info().offset; + int32_t b_offset = b->quantization_info().offset; + bool run_vector_matrix_multiplication = a->dimension(1) < 2; #ifdef ARM_COMPUTE_AARCH64_V8_2 // Check for DOT product instruction @@ -215,22 +227,29 @@ Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensor else #endif /* ARM_COMPUTE_AARCH64_V8_2 */ { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - TensorInfo info_a(shape_tmp_a, 1, a->data_type()); - TensorInfo info_b(shape_tmp_b, 1, b->data_type()); - - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); + if(!run_vector_matrix_multiplication) + { + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->tensor_shape(); + shape_tmp_a.set(0, a->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->tensor_shape(); + shape_tmp_b.set(0, b->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); + + TensorInfo info_a(shape_tmp_a, 1, a->data_type()); + TensorInfo info_b(shape_tmp_b, 1, b->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b)); + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output)); + } } TensorInfo info_vector_sum_col, info_vector_sum_row; @@ -271,14 +290,18 @@ void NEGEMMLowpMatrixMultiplyCore::run() { _memory_group.acquire(); - if(_mtx_a_reshape_kernel) + // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction + if(!_run_vector_matrix_multiplication && !_dot_product_path) { - NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); - } + if(_mtx_a_reshape_kernel) + { + NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY); + } - if(_mtx_b_reshape_kernel) - { - NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + 
if(_mtx_b_reshape_kernel) + { + NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); + } } NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY); diff --git a/tests/benchmark/CL/GEMMLowp.cpp b/tests/benchmark/CL/GEMMLowp.cpp new file mode 100644 index 0000000000..039695b18f --- /dev/null +++ b/tests/benchmark/CL/GEMMLowp.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "tests/CL/CLAccessor.h" +#include "tests/benchmark/fixtures/GEMMLowpFixture.h" +#include "tests/datasets/GoogleNetGEMMDataset.h" +#include "tests/datasets/MatrixMultiplyGEMMDataset.h" +#include "tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace test +{ +using CLGEMMLowpFixture = GEMMLowpMatrixMultiplyCoreFixture; + +TEST_SUITE(CL) + +REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1GEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::ALL, datasets::GoogLeNetInceptionV1GEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(MatrixMultiplyGEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::ALL, datasets::MatrixMultiplyGEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(GoogleNetGEMMLowp, CLGEMMLowpFixture, framework::DatasetMode::NIGHTLY, datasets::GoogleNetGEMMDataset()); + +TEST_SUITE_END() +} // namespace test +} // namespace arm_compute diff --git a/tests/benchmark/NEON/GEMMLowp.cpp b/tests/benchmark/NEON/GEMMLowp.cpp index a0e5e694bd..e3ad1d43d2 100644 --- a/tests/benchmark/NEON/GEMMLowp.cpp +++ b/tests/benchmark/NEON/GEMMLowp.cpp @@ -23,42 +23,30 @@ */ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" #include "tests/benchmark/fixtures/GEMMLowpFixture.h" +#include "tests/datasets/GoogleNetGEMMDataset.h" +#include "tests/datasets/MatrixMultiplyGEMMDataset.h" +#include 
"tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" #include "utils/TypePrinter.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h" -#include "tests/NEON/Helper.h" - namespace arm_compute { namespace test { -const auto data_int_blk = framework::dataset::make("M", 800) * framework::dataset::make("N", 800) * framework::dataset::make("by", 8, 13) * framework::dataset::make("block", 4, 9); +using NEGEMMLowpFixture = GEMMLowpMatrixMultiplyCoreFixture; TEST_SUITE(NEON) -TEST_SUITE(INTERLEAVE_BLOCKED) -using NEInterleaveBlocked = NESynthetizeFunction; -using NEGEMMInterleaveBlockedFixture = GEMMInterleaveBlockedFixture; -REGISTER_FIXTURE_DATA_TEST_CASE(InterleaveBlocked, NEGEMMInterleaveBlockedFixture, framework::DatasetMode::ALL, data_int_blk); -TEST_SUITE_END() - -#if 0 //FIXME: enable when we update NEGEMMLowp interface to work without offsets -TEST_SUITE(U32) -using NEGEMMLowpFixture = GEMMLowpFixture; -REGISTER_FIXTURE_DATA_TEST_CASE(GEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, framework::dataset::make("M", 100, 120) * framework::dataset::make("N", 100, - 110) - * framework::dataset::make("K", 16, 20)); +REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV1GEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, datasets::GoogLeNetInceptionV1GEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(MatrixMultiplyGEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::ALL, datasets::MatrixMultiplyGEMMDataset()); +REGISTER_FIXTURE_DATA_TEST_CASE(GoogleNetGEMMLowp, NEGEMMLowpFixture, framework::DatasetMode::NIGHTLY, datasets::GoogleNetGEMMDataset()); TEST_SUITE_END() -#endif // defined(__aarch64__) - -TEST_SUITE_END() - } // namespace test } // namespace arm_compute diff --git a/tests/benchmark/fixtures/GEMMLowpFixture.h b/tests/benchmark/fixtures/GEMMLowpFixture.h index b640705990..4bd7dfd42f 100644 --- a/tests/benchmark/fixtures/GEMMLowpFixture.h +++ b/tests/benchmark/fixtures/GEMMLowpFixture.h @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_GEMMFIXTURE -#define ARM_COMPUTE_TEST_GEMMFIXTURE +#ifndef ARM_COMPUTE_TEST_GEMMLOWPFIXTURE +#define ARM_COMPUTE_TEST_GEMMLOWPFIXTURE #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" @@ -34,59 +34,26 @@ namespace arm_compute { namespace test { -template -class GEMMInterleaveBlockedFixture : public framework::Fixture -{ -public: - template - void setup(size_t x, size_t y, int int_by, int block) - { - const float interleave_by_f32 = int_by; - const TensorShape shape_a(x, y); - const TensorShape shape_b(static_cast(x * interleave_by_f32), static_cast(std::ceil(y / interleave_by_f32))); - // Create tensors - a = create_tensor(shape_a, DataType::U8, 1); - b = create_tensor(shape_b, DataType::U8, 1); - - // Create and configure function - f.configure(&a, &b, int_by, block, Transposed); - - // Allocate tensors - a.allocator()->allocate(); - b.allocator()->allocate(); - } - void run() - { - f.run(); - } - - void teardown() - { - a.allocator()->free(); - b.allocator()->free(); - } - -private: - TensorType a{}; - TensorType b{}; - Function f{}; -}; - /** Fixture that can be used for NEON and CL */ template -class GEMMLowpFixture : public framework::Fixture +class GEMMLowpMatrixMultiplyCoreFixture : public framework::Fixture { public: template - void setup(size_t m, size_t n, size_t k) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape shape_dst, float alpha, float beta) { - const TensorShape shape_a(k, m); - const TensorShape shape_b(n, k); - const TensorShape shape_c(n, m); + // TODO (COMPMID-717): The interface used for GEMMLowp is the same one used for GEMM in order to re-use the datasets + // However the interface for both GEMM and GEMMLowp should be reworked in order to accepts only the 3 dimensions M, N and K + ARM_COMPUTE_UNUSED(shape_c); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + + // Note: The offsets for matrix A and matrix B are set to 0 in order to skip the computation for the offset contribution + // Create tensors - a = create_tensor(shape_a, DataType::U8, 1); - b = create_tensor(shape_b, DataType::U8, 1); - c = create_tensor(shape_c, DataType::U32, 1); + a = create_tensor(shape_a, DataType::QASYMM8, 1, 0, QuantizationInfo(1.0f / 255.0f, 0)); + b = create_tensor(shape_b, DataType::QASYMM8, 1, 0, QuantizationInfo(1.0f / 255.0f, 0)); + c = create_tensor(shape_dst, DataType::S32, 1, 0, QuantizationInfo(1.0f / 255.0f, 0)); // Create and configure function gemmlowp.configure(&a, &b, &c); @@ -99,7 +66,6 @@ public: // Fill tensors library->fill_tensor_uniform(Accessor(a), 0); library->fill_tensor_uniform(Accessor(b), 1); - library->fill_tensor_uniform(Accessor(c), 2); } void run() { @@ -122,4 +88,4 @@ private: } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_GEMMFIXTURE */ +#endif /* ARM_COMPUTE_TEST_GEMMLOWPFIXTURE */ diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h index 87f879e70a..5c0230e262 100644 --- a/tests/datasets/LargeGEMMLowpDataset.h +++ b/tests/datasets/LargeGEMMLowpDataset.h @@ -42,7 +42,7 @@ class LargeGEMMLowpDataset final : public GEMMLowpDataset public: LargeGEMMLowpDataset() { - add_config(TensorShape(923U, 2U), TensorShape(871U, 923U), TensorShape(871U, 2U), 0, 0); + add_config(TensorShape(923U, 1U), TensorShape(871U, 923U), TensorShape(871U, 1U), 0, 0); add_config(TensorShape(923U, 429U), TensorShape(871U, 923U), TensorShape(871U, 429U), 0, 0); add_config(TensorShape(873U, 7U), TensorShape(784U, 
873U), TensorShape(784U, 7U), -1, 3); add_config(TensorShape(873U, 513U), TensorShape(784U, 873U), TensorShape(784U, 513U), 0, 4); diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h index 1d4ab53be5..b6651bdb42 100644 --- a/tests/datasets/SmallGEMMLowpDataset.h +++ b/tests/datasets/SmallGEMMLowpDataset.h @@ -42,7 +42,7 @@ class SmallGEMMLowpDataset final : public GEMMLowpDataset public: SmallGEMMLowpDataset() { - add_config(TensorShape(21U, 2U), TensorShape(43U, 21U), TensorShape(43U, 2U), 0, 0); + add_config(TensorShape(21U, 1U), TensorShape(43U, 21U), TensorShape(43U, 1U), 0, 0); add_config(TensorShape(21U, 13U), TensorShape(33U, 21U), TensorShape(33U, 13U), 0, 0); add_config(TensorShape(31U, 3U), TensorShape(72U, 31U), TensorShape(72U, 3U), -2, 13); add_config(TensorShape(52U, 13U), TensorShape(33U, 52U), TensorShape(33U, 13U), 0, 4); -- cgit v1.2.1
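
The NEON intrinsics in the new vector-matrix kernels above are dense, so a minimal scalar sketch of what that path computes may help: each output element is the widened dot product of the single row of A with one column of B, accumulated in 32 bits, and the a_offset/b_offset contributions are added afterwards by the separate offset-contribution kernel, just as in the matrix-matrix path. The function and variable names below are illustrative only; they are not part of the patch or the library.

// Illustrative scalar reference (not library code): an unsigned 8-bit
// vector (1 x width_a) times a matrix (width_a x width_b), accumulated
// into int32, mirroring what the vector path of
// NEGEMMLowpMatrixMultiplyKernel produces before the offset contribution.
#include <cstddef>
#include <cstdint>

void vector_matrix_multiply_u8_ref(const uint8_t *vec_a, const uint8_t *matrix_b,
                                   int32_t *vec_out, size_t width_a, size_t width_b,
                                   size_t stride_b /* row stride of B, in elements */)
{
    for(size_t x = 0; x < width_b; ++x)
    {
        // Both operands are non-negative, so a uint32_t accumulator is safe,
        // mirroring the uint32x4x4_t accumulators used in the NEON kernel.
        uint32_t acc = 0;
        for(size_t k = 0; k < width_a; ++k)
        {
            acc += static_cast<uint32_t>(vec_a[k]) * static_cast<uint32_t>(matrix_b[k * stride_b + x]);
        }
        vec_out[x] = static_cast<int32_t>(acc);
    }
}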
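
For the vector path, run() splits the output columns across threads itself rather than relying on the scheduler's window split: each thread starts at 16 * thread_id, steps by 16 * num_threads, and rounds its end up with ceil_to_multiple so the per-thread stripes interleave cleanly. A small worked example with assumed numbers (width_matrix_b = 100, two threads):

// Hypothetical numbers, only to illustrate the split performed in
// NEGEMMLowpMatrixMultiplyKernel::run() for the vector-matrix case.
// Thread 0: start = 0,  step = 32, end = ceil_to_multiple(100 - 0, 32) + 0  = 128 -> x = 0, 32, 64, 96
// Thread 1: start = 16, step = 32, end = ceil_to_multiple(100 - 16, 32) + 16 = 112 -> x = 16, 48, 80
// Together the two stripes visit every 16-wide block of columns (0, 16, ..., 96),
// and the "if(id.x() > width_b) return;" guard inside the kernel skips any block
// that starts past the width of matrix B.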
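
Finally, a hedged end-to-end sketch modelled on the GEMMLowpMatrixMultiplyCoreFixture added by this patch: a single-row matrix A is what selects the new vector-matrix kernel inside NEGEMMLowpMatrixMultiplyCore (unless the AArch64 dot-product assembly path is available, which takes precedence). The shapes are taken from the updated LargeGEMMLowpDataset entry; exact TensorInfo constructor signatures may vary slightly between library versions, so treat this as a sketch rather than a verified build.

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"

using namespace arm_compute;

int main()
{
    // A is 1 x 923 (a vector), B is 923 x 871, the int32 result is 1 x 871.
    // Shapes follow the library's (width, height) convention used in the datasets above.
    // Offsets are set to 0, as in the benchmark fixture, to skip the offset contribution.
    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(923U, 1U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
    b.allocator()->init(TensorInfo(TensorShape(871U, 923U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 255.f, 0)));
    dst.allocator()->init(TensorInfo(TensorShape(871U, 1U), 1, DataType::S32));

    NEGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, &dst); // a has a single row, so the vector-matrix path is chosen

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized data ...

    gemmlowp.run();
    return 0;
}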