From b3182b19251cd010baad8252e7607de7059ac986 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Fri, 4 Sep 2020 08:44:52 +0100
Subject: COMPMID-3157: Remove padding from NEGEMMTranspose1xWKernel

- Remove padding from NEGEMMTranspose1xWKernel
- Extend test for validating zero padding requirement

Change-Id: I9ce4ca95a500229b045dc140cfff21fdf7373700
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3920
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 96 +++++++---------------
 tests/validation/NEON/GEMM.cpp                     | 61 +++++++++-----
 2 files changed, 70 insertions(+), 87 deletions(-)

diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 951cb19679..a8adc45645 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -60,28 +60,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
-    // Configure window in case of configured output
-    if(output->total_size() != 0)
-    {
-        AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-    }
-
-    const bool window_changed = update_window_and_padding(win, input_access);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
 } // namespace
 
 void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
@@ -97,16 +75,21 @@ void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
     _input  = input;
     _output = output;
 
+    const size_t vector_size = 16 / input->info()->element_size();
+
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    Window win = calculate_max_window(*input->info(), Steps(vector_size));
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
 }
 
 Status NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
     return Status{};
 }
@@ -136,52 +119,29 @@ void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
     Iterator in(_input, window);
     Iterator out(_output, win_out);
 
-    switch(_input->info()->element_size())
+    const size_t in_width     = _input->info()->dimension(0);
+    const size_t element_size = _input->info()->element_size();
+    const size_t out_stride   = _output->info()->strides_in_bytes()[1];
+    const size_t vector_size  = 16 / element_size;
+
+    execute_window_loop(window, [&](const Coordinates & id)
     {
-        case 1:
-        {
-            const size_t out_stride = _output->info()->strides_in_bytes()[1];
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                // Output address = base addr + (y * 16) + (x / 16 ) * stride
-                const uint8_t *in_ptr  = in.ptr();
-                uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride;
-                vst1q_u8(out_ptr, vld1q_u8(in_ptr));
-            },
-            in, out);
-            break;
-        }
-        case 2:
+        const uint8_t *in_ptr  = in.ptr();
+        uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
+
+        for(size_t k = 0; k < vector_size; ++k)
         {
-            const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t);
-            execute_window_loop(window, [&](const Coordinates & id)
+            // If the input width is not multiple of W, we fill the reference with 0s
+            if((id.x() + k) >= in_width)
             {
-                // Output address = base addr + (y * 8) + (x / 8 ) * stride
-                const auto in_ptr  = reinterpret_cast<const uint16_t *>(in.ptr());
-                const auto out_ptr = reinterpret_cast<uint16_t *>(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride;
-                vst1q_u16(out_ptr, vld1q_u16(in_ptr));
-            },
-            in, out);
-            break;
-        }
-        case 4:
-        {
-            const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float);
-            execute_window_loop(window, [&](const Coordinates & id)
+                std::memset(out_ptr + k * element_size, 0, element_size);
+            }
+            else
             {
-                // Output address = base addr + (y * 4) + (x / 4 ) * stride
-                const auto in_ptr  = reinterpret_cast<const uint32_t *>(in.ptr());
-                const auto out_ptr = reinterpret_cast<uint32_t *>(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride;
-                vst1q_u32(out_ptr, vld1q_u32(in_ptr));
-            },
-            in, out);
-            break;
+                std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
+            }
         }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Element size not supported");
-            break;
-        }
-    }
+    },
+    in, out);
 }
 } // namespace arm_compute
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index 9105638a6e..dfac72f3a5 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -68,27 +68,23 @@ const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::d
 const auto data_transpose  = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);
 
 /** Zero padding test */
-bool validate_zero_padding(unsigned int m_value, unsigned int k_value)
+template <typename FunctionType>
+bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value)
 {
-    const unsigned int M = m_value;
-    const unsigned int K = k_value;
-
-    const TensorShape lhs_shape(K, M);
-    const TensorShape lhs_shape_reshaped(K * 4, std::ceil(M / 4.0f));
+    const TensorShape in_shape(dim0_value, dim1_value);
 
     // Create tensors
-    Tensor lhs = create_tensor<Tensor>(lhs_shape, DataType::U32);
-    Tensor dst = create_tensor<Tensor>(lhs_shape_reshaped, DataType::U32);
+    Tensor in = create_tensor<Tensor>(in_shape, DataType::U32);
+    Tensor dst;
 
-    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
 
     // Validate zero-padding
-    NEGEMMInterleave4x4Kernel lhs_reshape;
+    FunctionType func;
 
-    lhs_reshape.configure(&lhs, &dst);
+    func.configure(&in, &dst);
 
-    return lhs.info()->padding().empty();
+    return in.info()->padding().empty();
 }
 
 } // namespace
@@ -97,15 +93,42 @@ TEST_SUITE(NEON)
 TEST_SUITE(GEMM)
 
 TEST_SUITE(TRANSPOSE_1XW)
-using NEGEMMTranspose1xW        = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMTranspose1xWKernel, 4>;
-using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, float>;
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::F32))
+using NEGEMMTranspose1xW = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMTranspose1xWKernel, 4>;
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
+                   framework::dataset::make("N", { 1, 23, 63, 101 }),
+                   framework::dataset::make("K", { 1, 47, 29, 27 })),
+               n_value, k_value)
+{
+    bool status = validate_zero_padding<NEGEMMTranspose1xWKernel>(n_value, k_value);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE(U32)
+using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U32))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U32
+
+TEST_SUITE(U16)
+using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U16))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+using NEGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, NEGEMMTranspose1xW, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U8))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-TEST_SUITE_END() // FP32
+TEST_SUITE_END() // U8
 
 TEST_SUITE_END() // TRANSPOSE_1XW
 
@@ -117,7 +140,7 @@ DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
                    framework::dataset::make("K", { 1, 47, 29, 27 })),
                m_value, k_value)
 {
-    bool status = validate_zero_padding(m_value, k_value);
+    bool status = validate_zero_padding<NEGEMMInterleave4x4Kernel>(m_value, k_value);
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
-- 
cgit v1.2.1