aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2020-09-03 13:20:34 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2020-09-03 17:54:45 +0000
commitfeaea101da17b383fe85440b0820132d0e0fa97d (patch)
tree0bcbe39636efe56ba7b0799e39f2880a2b09ba45
parentec4dee8c68a3d0f6d63db184bfb2f4589429778e (diff)
downloadComputeLibrary-feaea101da17b383fe85440b0820132d0e0fa97d.tar.gz
COMPMID-3143: Remove padding from NEGEMMInterleave4x4Kernel
- Remove padding from NEGEMMInterleave4x4Kernel - Extend test for validating zero padding requirement Change-Id: I94abc271e005f9dd6e1721b185631f55f598dbfd Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3915 Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h25
-rw-r--r--src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp186
-rw-r--r--tests/validation/NEON/GEMM.cpp61
3 files changed, 143 insertions, 129 deletions
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 7ddbf4bca8..322932bab2 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,15 +77,26 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
- /** Common signature for all the transpose functions
+ /** Template function to run gemm interleave 4x4
*
- * @param[in] input An input tensor. Data types supported: All
- * @param[out] output The output tensor. Data type supported: same as @p input
- * @param[in] window Region on which to execute the kernel.
+ * @tparam ScalarType Scalar datatype
+ *
+ * @param[in] input Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+ * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <typename ScalarType>
+ void gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window);
+
+ /** Common signature for all the specialised gemm interleave 4x4 functions
+ *
+ * @param[in] input Input tensor. Data types supported: uint32_t, uint16_t and uint8_t
+ * @param[out] output Output tensor. Data types supported: uint32_t, uint16_t and uint8_t
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
*/
- using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
+ using GEMMInterleaveFunctionFuncPtr = void (NEGEMMInterleave4x4Kernel::*)(const ITensor *input, ITensor *output, const Window &window);
- GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
+ GEMMInterleaveFunctionFuncPtr _func;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 3d178316c6..8b4ad0da23 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 Arm Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,113 +61,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- unsigned int num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- bool window_changed = false;
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- window_changed = window_changed || update_window_and_padding(win, input_access);
-
- // Configure window in case of configured output
- if(output->total_size() != 0)
- {
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
- window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, input->valid_region());
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
- const size_t in_stride = input->info()->strides_in_bytes()[1];
-
- // Set window for output tensor
- Window win_out(window);
- win_out.scale(Window::DimY, 0.25f);
- Iterator in(input, window);
-
- win_out.set_dimension_step(Window::DimX, 32);
- Iterator out(output, win_out);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- const uint8x8x4_t data =
- {
- {
- vld1_u8(in.ptr() + 0 * in_stride),
- vld1_u8(in.ptr() + 1 * in_stride),
- vld1_u8(in.ptr() + 2 * in_stride),
- vld1_u8(in.ptr() + 3 * in_stride),
- }
- };
- vst4_u8(out.ptr(), data);
- },
- in, out);
-}
-
-void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
- const size_t in_stride = input->info()->strides_in_bytes()[1];
-
- // Set window for output tensor
- Window win_out(window);
- win_out.scale(Window::DimY, 0.25f);
- Iterator in(input, window);
-
- win_out.set_dimension_step(Window::DimX, 16);
- Iterator out(output, win_out);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- const uint16x4x4_t data =
- {
- {
- vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 0 * in_stride)),
- vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 1 * in_stride)),
- vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 2 * in_stride)),
- vld1_u16(reinterpret_cast<const uint16_t *>(in.ptr() + 3 * in_stride)),
- }
- };
- vst4_u16(reinterpret_cast<uint16_t *>(out.ptr()), data);
- },
- in, out);
-}
-
-void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window)
-{
- const size_t in_stride = input->info()->strides_in_bytes()[1];
-
- // Set window for output tensor
- Window win_out(window);
- win_out.scale(Window::DimY, 0.25f);
- Iterator in(input, window);
-
- win_out.set_dimension_step(Window::DimX, 16);
- Iterator out(output, win_out);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- const uint32x4x4_t data =
- {
- {
- vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 0 * in_stride)),
- vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 1 * in_stride)),
- vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 2 * in_stride)),
- vld1q_u32(reinterpret_cast<const uint32_t *>(in.ptr() + 3 * in_stride))
- }
- };
- vst4q_u32(reinterpret_cast<uint32_t *>(out.ptr()), data);
- },
- in, out);
-}
} // namespace
NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
@@ -191,33 +84,92 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
switch(input->info()->element_size())
{
case 1:
- _func = &gemm_interleave_8bit_elements;
+ _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint8_t>;
break;
case 2:
- _func = &gemm_interleave_16bit_elements;
+ _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint16_t>;
break;
case 4:
- _func = &gemm_interleave_32bit_elements;
+ _func = &NEGEMMInterleave4x4Kernel::gemm_interleave4x4<uint32_t>;
break;
default:
ARM_COMPUTE_ERROR_ON("Element size not supported");
break;
}
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ Window win = calculate_max_window(*input->info(), Steps(1, 4));
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->info()->num_dimensions());
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
}
Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
return Status{};
}
+template <typename ScalarType>
+void NEGEMMInterleave4x4Kernel::gemm_interleave4x4(const ITensor *input, ITensor *output, const Window &window)
+{
+ const size_t window_start_x = window.x().start();
+ const size_t window_end_x = window.x().end();
+
+ const size_t in_height = input->info()->dimension(1);
+ const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+ const size_t partial_y = in_height % 4;
+
+ // Set window for the input tensor
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ // Set window for the output tensor
+ Window win_out(window);
+ win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+ win_out.scale(Window::DimY, 0.25f);
+
+ Iterator in(input, win);
+ Iterator out(output, win_out);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ if(id.y() + 4 <= static_cast<int>(in_height))
+ {
+ for(size_t x = window_start_x; x < window_end_x; ++x)
+ {
+ const ScalarType data[4] =
+ {
+ *(reinterpret_cast<const ScalarType *>(in.ptr() + 0 * in_stride) + x),
+ *(reinterpret_cast<const ScalarType *>(in.ptr() + 1 * in_stride) + x),
+ *(reinterpret_cast<const ScalarType *>(in.ptr() + 2 * in_stride) + x),
+ *(reinterpret_cast<const ScalarType *>(in.ptr() + 3 * in_stride) + x),
+ };
+ std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
+ }
+ }
+ else
+ {
+ for(size_t x = window_start_x; x < window_end_x; ++x)
+ {
+ ScalarType data[4] = { 0, 0, 0, 0 };
+
+ for(size_t y = 0; y < partial_y; ++y)
+ {
+ data[y] = *(reinterpret_cast<const ScalarType *>(in.ptr() + y * in_stride) + x);
+ }
+
+ std::memcpy(out.ptr() + x * 4 * sizeof(ScalarType), data, 4 * sizeof(ScalarType));
+ }
+ }
+ },
+ in, out);
+}
+
void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
@@ -233,5 +185,5 @@ void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info
*
* After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
*/
- (*_func)(_input, _output, window);
+ (this->*_func)(_input, _output, window);
}
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index f817390780..9105638a6e 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,6 +67,30 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12);
const auto data_transpose = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int k_value)
+{
+ const unsigned int M = m_value;
+ const unsigned int K = k_value;
+
+ const TensorShape lhs_shape(K, M);
+ const TensorShape lhs_shape_reshaped(K * 4, std::ceil(M / 4.0f));
+
+ // Create tensors
+ Tensor lhs = create_tensor<Tensor>(lhs_shape, DataType::U32);
+ Tensor dst = create_tensor<Tensor>(lhs_shape_reshaped, DataType::U32);
+
+ ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Validate zero-padding
+ NEGEMMInterleave4x4Kernel lhs_reshape;
+
+ lhs_reshape.configure(&lhs, &dst);
+
+ return lhs.info()->padding().empty();
+}
+
} // namespace
TEST_SUITE(NEON)
@@ -88,14 +112,41 @@ TEST_SUITE_END() // TRANSPOSE_1XW
TEST_SUITE(INTERLEAVE_4X4)
using NEGEMMInterleave4x4 = NESynthetizeFunctionWithZeroConstantBorder<NEGEMMInterleave4x4Kernel, 4>;
-TEST_SUITE(FP32)
-using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, float>;
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::F32))
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
+ framework::dataset::make("M", { 1, 23, 63, 101 }),
+ framework::dataset::make("K", { 1, 47, 29, 27 })),
+ m_value, k_value)
+{
+ bool status = validate_zero_padding(m_value, k_value);
+ ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+TEST_SUITE(U32)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint32_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32))
{
// Validate output
validate(Accessor(_target), _reference);
}
-TEST_SUITE_END() // FP32
+TEST_SUITE_END() // U32
+
+TEST_SUITE(U16)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint16_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U16
+
+TEST_SUITE(U8)
+using NEGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, NEGEMMInterleave4x4, uint8_t>;
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::QASYMM8))
+{
+ // Validate output
+ validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // U8
TEST_SUITE_END() // INTERLEAVE_4X4