From feaea101da17b383fe85440b0820132d0e0fa97d Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 3 Sep 2020 13:20:34 +0100
Subject: COMPMID-3143: Remove padding from NEGEMMInterleave4x4Kernel

- Remove padding from NEGEMMInterleave4x4Kernel
- Extend test for validating zero padding requirement

Change-Id: I94abc271e005f9dd6e1721b185631f55f598dbfd
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3915
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h  |  25 ++-
 .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp     | 186 ++++++++-------------
 tests/validation/NEON/GEMM.cpp                     |  61 ++++++-
 3 files changed, 143 insertions(+), 129 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 7ddbf4bca8..322932bab2 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -77,15 +77,26 @@ public:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    /** Common signature for all the transpose functions
+    /** Template function to run gemm interleave 4x4
      *
-     * @param[in]  input  An input tensor. Data types supported: All
-     * @param[out] output The output tensor. Data type supported: same as @p input
-     * @param[in]  window Region on which to execute the kernel.
+     * @tparam ScalarType Scalar datatype
+     *
+     * @param[in]  input  Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename ScalarType>
+    void gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window);
+
+    /** Common signature for all the specialised gemm interleave 4x4 functions
+     *
+     * @param[in]  input  Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+     * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
+    using GEMMInterleaveFunctionFuncPtr = void (NEGEMMInterleave4x4Kernel::*)(const ITensor *input, ITensor *output, const Window &window);
 
-    GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
+    GEMMInterleaveFunctionFuncPtr _func;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 3d178316c6..8b4ad0da23 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -61,113 +61,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    unsigned int           num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-    bool                   window_changed                      = false;
-
-    // Configure kernel window
-    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    window_changed = window_changed || update_window_and_padding(win, input_access);
-
-    // Configure window in case of configured output
-    if(output->total_size() != 0)
-    {
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
-        window_changed = window_changed || update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.25f);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, 32);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x8x4_t data =
-        {
-            {
-                vld1_u8(in.ptr() + 0 * in_stride),
-                vld1_u8(in.ptr() + 1 * in_stride),
-                vld1_u8(in.ptr() + 2 * in_stride),
-                vld1_u8(in.ptr() + 3 * in_stride),
-            }
-        };
-        vst4_u8(out.ptr(), data);
-    },
-    in, out);
-}
-
-void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.25f);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, 16);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint16x4x4_t data =
-        {
-            {
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)),
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)),
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)),
-                vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)),
-            }
-        };
-        vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
-    },
-    in, out);
-}
-
-void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
-    const size_t in_stride = input->info()->strides_in_bytes()[1];
-
-    // Set window for output tensor
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.25f);
-    Iterator in(input, window);
-
-    win_out.set_dimension_step(Window::DimX, 16);
-    Iterator out(output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint32x4x4_t data =
-        {
-            {
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)),
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)),
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)),
-                vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride))
-            }
-        };
-        vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
-    },
-    in, out);
-}
 } // namespace
 
 NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
@@ -191,33 +84,92 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
     switch(input->info()->element_size())
     {
         case 1:
-            _func = &gemm_interleave_8bit_elements;
+            _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint8_t>;
             break;
         case 2:
-            _func = &gemm_interleave_16bit_elements;
+            _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint16_t>;
            break;
         case 4:
-            _func = &gemm_interleave_32bit_elements;
+            _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint32_t>;
             break;
         default:
             ARM_COMPUTE_ERROR_ON("Element size not supported");
             break;
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    Window win = calculate_max_window(*input->info(), Steps(1, 4));
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
 }
 
 Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
     return Status{};
 }
 
+template <typename ScalarType>
+void NEGEMMInterleave4x4Kernel::gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window)
+{
+    const size_t window_start_x = window.x().start();
+    const size_t window_end_x   = window.x().end();
+
+    const size_t in_height = input->info()->dimension(1);
+    const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+    const size_t partial_y = in_height % 4;
+
+    // Set window for the input tensor
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Set window for the output tensor
+    Window win_out(window);
+    win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+    win_out.scale(Window::DimY, 0.25f);
+
+    Iterator in(input, win);
+    Iterator out(output, win_out);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        if(id.y() + 4 <= static_cast<int>(in_height))
+        {
+            for(size_t x = window_start_x; x < window_end_x; ++x)
+            {
+                const ScalarType data[4] =
+                {
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 0 * in_stride) + x),
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 1 * in_stride) + x),
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 2 * in_stride) + x),
+                    *(reinterpret_cast<const ScalarType *>(in.ptr() + 3 * in_stride) + x),
+                };
+                std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
+            }
+        }
+        else
+        {
+            for(size_t x = window_start_x; x < window_end_x; ++x)
+            {
+                ScalarType data[4] = { 0, 0, 0, 0 };
+
+                for(size_t y = 0; y < partial_y; ++y)
+                {
+                    data[y] = *(reinterpret_cast<const ScalarType *>(in.ptr() + y * in_stride) + x);
+                }
+
+                std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
+            }
+        }
+    },
+    in, out);
+}
+
 void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -233,5 +185,5 @@ void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info
      *
      * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
      */
-    (*_func)(_input, _output, window);
+    (this->*_func)(_input, _output, window);
 }
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index f817390780..9105638a6e 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -67,6 +67,30 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
 const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12);
 const auto data_transpose  = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);
 
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int k_value)
+{
+    const unsigned int M = m_value;
+    const unsigned int K = k_value;
+
+    const TensorShape lhs_shape(K, M);
+    const TensorShape lhs_shape_reshaped(K * 4, std::ceil(M / 4.0f));
+
+    // Create tensors
+    Tensor lhs = create_tensor<Tensor>(lhs_shape, DataType::U32);
+    Tensor dst = create_tensor<Tensor>(lhs_shape_reshaped, DataType::U32);
+
+    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Validate zero-padding
+    NEGEMMInterleave4x4Kernel lhs_reshape;
+
+    lhs_reshape.configure(&lhs, &dst);
+
+    return lhs.info()->padding().empty();
+}
+
 } // namespace
 
 TEST_SUITE(NEON)
@@ -88,14 +112,41 @@ TEST_SUITE_END() // TRANSPOSE_1XW
 TEST_SUITE(INTERLEAVE_4X4)
 using NEGEMMInterleave4x4 = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMInterleave4x4Kernel, 4>;
 
-TEST_SUITE(FP32)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, float>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::F32))
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
+                   framework::dataset::make("M", { 1, 23, 63, 101 }),
+                   framework::dataset::make("K", { 1, 47, 29, 27 })),
+               m_value, k_value)
+{
+    bool status = validate_zero_padding(m_value, k_value);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE(U32)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // FP32
+TEST_SUITE_END() // U32
+
+TEST_SUITE(U16)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::QASYMM8))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
 
 TEST_SUITE_END() // INTERLEAVE_4X4
-- 
cgit v1.2.1
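
Note on the reshape this patch rewrites: the kernel walks the input in groups of 4 rows and, for each column, writes the 4 vertically adjacent elements contiguously, so an input with M rows and K columns becomes ceil(M / 4) rows of K * 4 interleaved elements; a partial final group is zero-padded, which is what lets the kernel stop demanding padded tensors. The following is a minimal standalone sketch of that layout, not Compute Library code; the function name interleave4x4, the plain std::vector representation, and the src/dst names are illustrative assumptions.

#include <cstddef>
#include <vector>

// Scalar model of the 4x4 interleave: src is height x width, row-major.
// Each output "super-row" packs 4 input rows; the destination index is
// super_row * (width * 4) + column * 4 + lane, with lane = y % 4.
// Rows past the input height stay zero, mirroring the partial_y path in
// NEGEMMInterleave4x4Kernel::gemm_interleave4x4 above.
template <typename ScalarType>
std::vector<ScalarType> interleave4x4(const std::vector<ScalarType> &src, size_t width, size_t height)
{
    const size_t out_rows = (height + 3) / 4; // ceil(height / 4.0f)
    std::vector<ScalarType> dst(out_rows * width * 4, ScalarType(0)); // zero padding for a partial group

    for(size_t y = 0; y < height; ++y)
    {
        for(size_t x = 0; x < width; ++x)
        {
            dst[(y / 4) * width * 4 + x * 4 + (y % 4)] = src[y * width + x];
        }
    }
    return dst;
}

For example, with M = 23 and K = 47 (one of the ValidateZeroPadding cases), the result has ceil(23 / 4) = 6 super-rows of 47 * 4 elements, the last super-row zero-filled in its fourth lane, which is exactly the lhs_shape_reshaped(K * 4, std::ceil(M / 4.0f)) shape the test constructs.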