aboutsummaryrefslogtreecommitdiff
path: root/tests
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2018-03-02 11:18:12 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:49:16 +0000
commitd2fab7315bac3a586f2f1b1c8d64f2441f89ca64 (patch)
tree33572f0fea29d24546850f3835703f9869726122 /tests
parent27c08abe6947b1ee5b266799f2bb2bf0a05d0def (diff)
downloadComputeLibrary-d2fab7315bac3a586f2f1b1c8d64f2441f89ca64.tar.gz
COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part 4)
Implemented Winograd Output Transform (2x2,3x3) on OpenCL
Implemented CLWinogradConvolutionLayer on OpenCL

Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'tests')
-rw-r--r--tests/datasets/LargeConvolutionLayerDataset.h24
-rw-r--r--tests/datasets/SmallConvolutionLayerDataset.h12
-rw-r--r--tests/datasets/WinogradOutputTransformDataset.h153
-rw-r--r--tests/validation/CL/Winograd.cpp179
-rw-r--r--tests/validation/NEON/ConvolutionLayer.cpp6
-rw-r--r--tests/validation/fixtures/WinogradLayerFixture.h120
-rw-r--r--tests/validation/reference/ConvolutionLayer.cpp2
-rw-r--r--tests/validation/reference/Winograd.cpp218
-rw-r--r--tests/validation/reference/Winograd.h3
9 files changed, 637 insertions, 80 deletions
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index 086b2e3def..ec8e09fa81 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,6 +37,28 @@ namespace test
{
namespace datasets
{
+class LargeWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset // Large 3x3-kernel convolution configurations for the Winograd tests
+{
+public:
+ LargeWinogradConvolutionLayer3x3Dataset() // Each add_config() takes four shapes and a PadStrideInfo; presumably input, weights, bias, output - confirm against ConvolutionLayerDataset
+ {
+ // Kernel size 3: the second shape of every configuration starts with 3U, 3U
+ // Batch size 1: the first and fourth shapes are three-dimensional
+ add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 1, 0));
+ add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 1));
+ add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 1, 0));
+ add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 512U), TensorShape(512U), TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 1));
+ add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(12U, 12U, 512U), PadStrideInfo(1, 1, 0, 0));
+ // Batch size 3, 2 and 4: given by the fourth dimension of the first and fourth shapes
+ add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1));
+ add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 1));
+ }
+};
+
class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset
{
public:
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index adb61de8e2..696c396eef 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -37,10 +37,10 @@ namespace test
{
namespace datasets
{
-class SmallWinogradLayerDataset final : public ConvolutionLayerDataset
+class SmallWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset
{
public:
- SmallWinogradLayerDataset()
+ SmallWinogradConvolutionLayer3x3Dataset()
{
// Kernel size 3
// Batch size 1
@@ -48,8 +48,14 @@ public:
// Batch size 4
add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(21U, 25U, 21U, 4U), PadStrideInfo(1, 1, 0, 0));
add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 1, 1));
+ }
+};
- // Kernel size 5
+class SmallWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset
+{
+public:
+ SmallWinogradConvolutionLayer5x5Dataset()
+ {
add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U, 1U), TensorShape(1U), TensorShape(4U, 4U, 1U), PadStrideInfo(1, 1, 0, 0));
add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 2, 2));
}
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
new file mode 100644
index 0000000000..c42d6c8ebd
--- /dev/null
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+#define ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+
+#include "utils/TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class WinogradOutputTransformDataset // Base dataset: five parallel vectors exposed as (input shape, kernel dims, output dims, num tiles, data layout) tuples
+{
+public:
+ using type = std::tuple<TensorShape, Size2D, Size2D, Size2D, DataLayout>; // Value produced by dereferencing an iterator
+
+ struct iterator // Walks the five underlying vectors in lock-step
+ {
+ iterator(std::vector<TensorShape>::const_iterator a_it,
+ std::vector<Size2D>::const_iterator b_it,
+ std::vector<Size2D>::const_iterator c_it,
+ std::vector<Size2D>::const_iterator d_it,
+ std::vector<DataLayout>::const_iterator data_layout_it)
+ : _a_it{ std::move(a_it) },
+ _b_it{ std::move(b_it) },
+ _c_it{ std::move(c_it) },
+ _d_it{ std::move(d_it) },
+ _data_layout_it{ std::move(data_layout_it) }
+ {
+ }
+
+ std::string description() const // Human-readable "Key=value:" summary of the current configuration
+ {
+ std::stringstream description;
+ description << "Input=" << *_a_it << ":";
+ description << "KernelDims=" << *_b_it << ":";
+ description << "OutputDims=" << *_c_it << ":";
+ description << "NumTiles=" << *_d_it << ":";
+ description << "DataLayout=" << *_data_layout_it;
+ return description.str();
+ }
+
+ WinogradOutputTransformDataset::type operator*() const // Copies the current element of each vector into one tuple
+ {
+ return std::make_tuple(*_a_it, *_b_it, *_c_it, *_d_it, *_data_layout_it);
+ }
+
+ iterator &operator++() // Pre-increment: advances all five iterators together
+ {
+ ++_a_it;
+ ++_b_it;
+ ++_c_it;
+ ++_d_it;
+ ++_data_layout_it;
+
+ return *this;
+ }
+
+ private:
+ std::vector<TensorShape>::const_iterator _a_it;
+ std::vector<Size2D>::const_iterator _b_it;
+ std::vector<Size2D>::const_iterator _c_it;
+ std::vector<Size2D>::const_iterator _d_it;
+ std::vector<DataLayout>::const_iterator _data_layout_it;
+ };
+
+ iterator begin() const // Iterator positioned on the first configuration
+ {
+ return iterator(_a_shapes.begin(), _b_dims.begin(), _c_dims.begin(), _d_dims.begin(), _data_layout.begin());
+ }
+
+ int size() const // Number of complete configurations, i.e. the length of the shortest vector
+ {
+ return static_cast<int>(std::min(_a_shapes.size(), std::min(_b_dims.size(), std::min(_c_dims.size(), std::min(_d_dims.size(), _data_layout.size()))))); // explicit size_t -> int conversion instead of a silent narrowing
+ }
+
+ void add_config(TensorShape a, Size2D b, Size2D c, Size2D d, DataLayout data_layout) // Appends one configuration; arguments follow the order of `type`
+ {
+ _a_shapes.emplace_back(std::move(a));
+ _b_dims.emplace_back(std::move(b));
+ _c_dims.emplace_back(std::move(c));
+ _d_dims.emplace_back(std::move(d));
+ _data_layout.emplace_back(std::move(data_layout));
+ }
+
+protected:
+ WinogradOutputTransformDataset() = default; // Only derived datasets can construct the base
+ WinogradOutputTransformDataset(WinogradOutputTransformDataset &&) = default;
+
+private:
+ std::vector<TensorShape> _a_shapes{}; // Input tensor shapes (reported as "Input")
+ std::vector<Size2D> _b_dims{}; // Kernel dimensions (reported as "KernelDims")
+ std::vector<Size2D> _c_dims{}; // Output dimensions (reported as "OutputDims")
+ std::vector<Size2D> _d_dims{}; // Number of tiles (reported as "NumTiles")
+ std::vector<DataLayout> _data_layout{}; // Data layouts (reported as "DataLayout")
+};
+
+class SmallWinogradOutputTransformDataset final : public WinogradOutputTransformDataset // Small output-transform configurations, all with 3x3 kernels and NCHW layout
+{
+public:
+ SmallWinogradOutputTransformDataset() // Arguments per row: input shape, kernel dims, output dims, num tiles, data layout
+ {
+ add_config(TensorShape(24U, 49U, 16U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW); // second input dimension (49) equals 7 * 7 tiles
+ add_config(TensorShape(13U, 6U, 16U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW); // 6 == 3 * 2 tiles
+ add_config(TensorShape(7U, 20U, 16U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW); // 20 == 4 * 5 tiles
+ add_config(TensorShape(24U, 49U, 16U, 3U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW); // same as the first row with a fourth dimension of 3
+ add_config(TensorShape(13U, 6U, 16U, 2U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW); // same as the second row with a fourth dimension of 2
+ add_config(TensorShape(7U, 20U, 16U, 5U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW); // same as the third row with a fourth dimension of 5
+ }
+};
+
+class LargeWinogradOutputTransformDataset final : public WinogradOutputTransformDataset // Large output-transform configurations, all with 3x3 kernels and NCHW layout
+{
+public:
+ LargeWinogradOutputTransformDataset() // Arguments per row: input shape, kernel dims, output dims, num tiles, data layout
+ {
+ add_config(TensorShape(128U, 3136U, 16U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW); // second input dimension (3136) equals 56 * 56 tiles
+ add_config(TensorShape(256U, 784U, 16U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW); // 784 == 28 * 28 tiles
+ add_config(TensorShape(512U, 169U, 16U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW); // 169 == 13 * 13 tiles
+ add_config(TensorShape(128U, 3136U, 16U, 3U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW); // same as the first row with a fourth dimension of 3
+ add_config(TensorShape(256U, 784U, 16U, 2U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW); // same as the second row with a fourth dimension of 2
+ add_config(TensorShape(512U, 169U, 16U, 5U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW); // same as the third row with a fourth dimension of 5
+ }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET */
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 0b21ed2577..aa668fa575 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -22,17 +22,22 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
#include "tests/CL/CLAccessor.h"
#include "tests/CL/Helper.h"
#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LargeConvolutionLayerDataset.h"
#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
#include "tests/datasets/WinogradFilterTransformDataset.h"
#include "tests/datasets/WinogradInputTransformDataset.h"
+#include "tests/datasets/WinogradOutputTransformDataset.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
@@ -47,7 +52,7 @@ namespace validation
{
namespace
{
-constexpr AbsoluteTolerance<float> tolerance_f32(0.0001f);
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);
} // namespace
using namespace arm_compute::misc::shape_calculator;
@@ -65,9 +70,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32), // Kernel size not supported
TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32), // Strides not supported
- TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32), // valid
- TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32), // valid
- TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32) // valid
+ TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32) // Padding needed
}),
framework::dataset::make("OutputInfo", {
TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F16),
@@ -96,7 +101,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
Size2D(3U, 3U),
Size2D(3U, 3U)
})),
- framework::dataset::make("Expected", { false, false, false, false, true, true, true })),
+ framework::dataset::make("Expected", { false, false, false, false, false, false, false })),
input_info, output_info, conv_info, kernel_dims, expected)
{
ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, kernel_dims)) == expected, framework::LogLevel::ERRORS);
@@ -203,8 +208,172 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::Da
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
+
TEST_SUITE_END() // FilterTransform
+TEST_SUITE(OutputTransform)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( // Seven zipped datasets: the i-th entry of each list forms the i-th test case
+ framework::dataset::make("InputInfo",{
+ TensorInfo(TensorShape(24U, 49U, 16U, 5U), 1, DataType::F16), // F16 not supported
+ TensorInfo(TensorShape(128U, 3136U, 16U, 5U), 1, DataType::QASYMM8), // QASYMM8 not supported
+ TensorInfo(TensorShape(256U, 784U, 16U, 5U), 1, DataType::F32), // Kernel size not supported
+ TensorInfo(TensorShape(512U, 169U, 16U, 5U), 1, DataType::F32), // Valid
+ TensorInfo(TensorShape(13U, 6U, 16U, 4U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(7U, 16U, 16U, 7U), 1, DataType::F32), // Valid
+ TensorInfo(TensorShape(1U, 442U, 16U, 37U), 1, DataType::F32) // Wrong number of tiles
+ }),
+ framework::dataset::make("BiasInfo", { // One-dimensional bias whose length matches the first input dimension of the same row
+ TensorInfo(TensorShape(24U), 1, DataType::F16),
+ TensorInfo(TensorShape(128U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(256U), 1, DataType::F32),
+ TensorInfo(TensorShape(512U), 1, DataType::F32),
+ TensorInfo(TensorShape(13U), 1, DataType::F32),
+ TensorInfo(TensorShape(7U), 1, DataType::F32),
+ TensorInfo(TensorShape(1U), 1, DataType::F32)
+ })),
+ framework::dataset::make("OutputInfo", { // First two dimensions repeat the OutputDims entry of the same row
+ TensorInfo(TensorShape(14U, 14U, 24U, 5U), 1, DataType::F16),
+ TensorInfo(TensorShape(112U, 112U, 128U, 5U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(55U, 55U, 256U, 5U), 1, DataType::F32),
+ TensorInfo(TensorShape(26U, 26U, 512U, 5U), 1, DataType::F32),
+ TensorInfo(TensorShape(5U, 4U, 13U, 4U), 1, DataType::F32),
+ TensorInfo(TensorShape(8U, 8U, 7U, 7U), 1, DataType::F32),
+ TensorInfo(TensorShape(51U, 33U, 1U, 37U), 1, DataType::F32)
+ })),
+ framework::dataset::make("KernelDims", {
+ Size2D(3U, 3U),
+ Size2D(3U, 3U),
+ Size2D(5U, 5U), // the only non-3x3 kernel; pairs with the "Kernel size not supported" row
+ Size2D(3U, 3U),
+ Size2D(3U, 3U),
+ Size2D(3U, 3U),
+ Size2D(3U, 3U)
+ })),
+ framework::dataset::make("OutputDims", {
+ Size2D(14U, 14U),
+ Size2D(112U, 112U),
+ Size2D(55U, 55U),
+ Size2D(26U, 26U),
+ Size2D(5U, 4U),
+ Size2D(8U, 8U),
+ Size2D(51U, 33U)
+ })),
+ framework::dataset::make("NumTiles", {
+ Size2D(7U, 7U),
+ Size2D(56U, 56U),
+ Size2D(28U, 28U),
+ Size2D(13U, 13U),
+ Size2D(3U, 2U),
+ Size2D(4U, 4U),
+ Size2D(26U, 16U) // 26 * 16 = 416 does not match the 442 in the input shape of this row
+ })),
+ framework::dataset::make("Expected", { false, false, false, true, false, true, false })),
+ input_info, bias_info, output_info, kernel_dims, output_dims, num_tiles, expected)
+{
+ ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), kernel_dims, output_dims, num_tiles)) == expected, framework::LogLevel::ERRORS); // validate() outcome must equal the row's Expected flag
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradOutputTransform = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradOutputTransformKernel, 0>; // Function-style wrapper around the output transform kernel so it can be configured and run by the tests
+using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>; // CL/float instantiation of the validation fixture
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallWinogradOutputTransformDataset(), datasets::LargeWinogradOutputTransformDataset()), // Small and large datasets, each combined with F32
+ framework::dataset::make("DataType", { DataType::F32 })),
+ shape_a, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type)
+{
+ TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), output_convolved_dims, data_layout); // Output shape derived from the input info, output dims and layout
+
+ // Create tensors
+ CLTensor a = create_tensor<CLTensor>(shape_a, data_type);
+ CLTensor b = create_tensor<CLTensor>(shape_b, data_type);
+
+ ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); // Freshly created tensors are expected to report themselves as resizable
+ ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Create and configure function (nullptr is passed in the bias position)
+ CLWinogradOutputTransform winograd_output_transform;
+ winograd_output_transform.configure(&a, nullptr, &b, kernel_dims, output_convolved_dims, num_tiles);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, combine(datasets::SmallWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output: the fixture's CL result must match its reference within tolerance_f32
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output: the fixture's CL result must match its reference within tolerance_f32
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // OutputTransform
+
+TEST_SUITE(ConvolutionLayer)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( // Six zipped datasets: the i-th entry of each list forms the i-th test case
+ framework::dataset::make("InputInfo", {
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16), // FP16 not supported
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Datatype mismatch
+ TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported
+ TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32) // Kernel size not supported
+ }),
+ framework::dataset::make("WeightsInfo", {
+ TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F32),
+ TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8), // differs from the F32 input of the same row
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
+ TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
+ TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16) // 5x5 kernel
+ })),
+ framework::dataset::make("BiasesInfo", {
+ TensorInfo(TensorShape(19U), 1, DataType::F32),
+ TensorInfo(TensorShape(19U), 1, DataType::F32),
+ TensorInfo(TensorShape(21U), 1, DataType::F32),
+ TensorInfo(TensorShape(16U), 1, DataType::F32),
+ TensorInfo(TensorShape(16U), 1, DataType::F32)
+ })),
+ framework::dataset::make("OutputInfo", {
+ TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
+ TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
+ TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
+ TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
+ })),
+ framework::dataset::make("ConvInfo", {
+ PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 2, 0, 0), // second argument is 2; pairs with the "Stride y not supported" row
+ PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 1, 0)
+ })),
+ framework::dataset::make("Expected", { false, false, false, false, false })), // every row is expected to be rejected
+ input_info, weights_info, bias_info, output_info, conv_info, expected)
+{
+ ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS); // validate() outcome must equal the row's Expected flag
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>; // CL/float instantiation of the convolution validation fixture
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+ framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output: the fixture's CL result must match its reference within tolerance_f32
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output: the fixture's CL result must match its reference within tolerance_f32
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // ConvolutionLayer
+
TEST_SUITE_END() // Winograd
TEST_SUITE_END() // CL
} // namespace validation
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 59db279ac7..34306b381c 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -109,10 +109,12 @@ TEST_SUITE_END()
TEST_SUITE(WinogradLayer)
template <typename T>
-using NEWinogradLayerFixture = WinogradLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
+using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradLayerFixture<float>, framework::DatasetMode::PRECOMMIT, datasets::SmallWinogradLayerDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+ datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+ framework::dataset::make("DataType", { DataType::F32 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/WinogradLayerFixture.h
index bfe1efce3b..9811c28008 100644
--- a/tests/validation/fixtures/WinogradLayerFixture.h
+++ b/tests/validation/fixtures/WinogradLayerFixture.h
@@ -48,14 +48,14 @@ namespace validation
using namespace arm_compute::misc::shape_calculator;
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WinogradLayerValidationFixture : public framework::Fixture
+class WinogradConvolutionLayerValidationFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info)
+ void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, DataType data_type)
{
- _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info);
- _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info);
+ _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
+ _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
}
protected:
@@ -79,13 +79,14 @@ protected:
}
}
- TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+ TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+ DataType data_type)
{
// Create tensors
- TensorType src = create_tensor<TensorType>(input_shape, DataType::F32, 1);
- TensorType weights = create_tensor<TensorType>(weights_shape, DataType::F32, 1);
- TensorType bias = create_tensor<TensorType>(bias_shape, DataType::F32, 1);
- TensorType dst = create_tensor<TensorType>(output_shape, DataType::F32, 1);
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1);
+ TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+ TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1);
// Create and configure function
FunctionType conv;
@@ -111,20 +112,20 @@ protected:
fill(AccessorType(src), 0, -1.f, 1.f);
fill(AccessorType(weights), 1, -1.f, 1.f);
fill(AccessorType(bias), 2, -1.f, 1.f);
- fill(AccessorType(dst), 3, -1.f, 1.f);
- // Compute NEWinogradLayer function
+ // Compute Winograd Convolution function
conv.run();
return dst;
}
- SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+ SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+ DataType data_type)
{
// Create reference
- SimpleTensor<T> src{ input_shape, DataType::F32, 1 };
- SimpleTensor<T> weights{ weights_shape, DataType::F32, 1 };
- SimpleTensor<T> bias{ bias_shape, DataType::F32, 1 };
+ SimpleTensor<T> src{ input_shape, data_type, 1 };
+ SimpleTensor<T> weights{ weights_shape, data_type, 1 };
+ SimpleTensor<T> bias{ bias_shape, data_type, 1 };
// Fill reference
fill(src, 0, -1.f, 1.f);
@@ -136,8 +137,6 @@ protected:
TensorType _target{};
SimpleTensor<T> _reference{};
- int _fractional_bits{};
- DataType _data_type{};
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -178,7 +177,6 @@ protected:
{
ARM_COMPUTE_UNUSED(is_nchw_format);
- // Create tensors
TensorType src = create_tensor<TensorType>(input_shape, data_type);
TensorType dst = create_tensor<TensorType>(output_shape, data_type);
@@ -261,8 +259,8 @@ protected:
ARM_COMPUTE_UNUSED(is_nchw_format);
// Create tensors
- TensorType src = create_tensor<TensorType>(input_shape, data_type);
- TensorType dst = create_tensor<TensorType>(output_shape, data_type);
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1);
// Create and configure function
FunctionType filter_transform;
@@ -288,7 +286,7 @@ protected:
SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, bool is_nchw_format, DataType data_type)
{
- ARM_COMPUTE_ERROR_ON(!is_nchw_format);
+ ARM_COMPUTE_UNUSED(is_nchw_format);
// Create reference
SimpleTensor<T> src{ input_shape, data_type, 1 };
@@ -302,6 +300,86 @@ protected:
TensorType _target{};
SimpleTensor<T> _reference{};
};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradOutputTransformValidationFixture : public framework::Fixture // Runs FunctionType on a filled input and computes the reference output transform for the same configuration
+{
+public:
+ template <typename...>
+ void setup(TensorShape input_shape, Size2D kernel_dims, Size2D output_convolved_dims, Size2D num_tiles, DataLayout data_layout, DataType data_type) // Fills _target and _reference for one dataset row
+ {
+ TensorShape output_shape = compute_winograd_output_transform_shape(TensorInfo(input_shape, 1, data_type), output_convolved_dims, data_layout); // Output shape shared by target and reference
+
+ _target = compute_target(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+ _reference = compute_reference(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+ }
+
+protected:
+ template <typename U>
+ void fill(U &&tensor, int i, float min, float max) // Fills an F32 tensor from a uniform real distribution over (min, max); i is forwarded to the library fill call
+ {
+ switch(tensor.data_type())
+ {
+ case DataType::F32:
+ {
+ std::uniform_real_distribution<> distribution(min, max);
+ library->fill(tensor, distribution, i);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ library->fill_tensor_uniform(tensor, i); // NOTE(review): presumably unreachable if ARM_COMPUTE_ERROR does not return - confirm
+ break;
+ }
+ }
+ }
+
+ TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles, DataLayout data_layout, // num_tiles is read-only here, so it is taken by const reference like its siblings
+ DataType data_type)
+ {
+ // Create tensors
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+
+ // Create and configure function (nullptr is passed in the bias position)
+ FunctionType output_transform;
+ output_transform.configure(&src, nullptr, &dst, kernel_dims, output_convolved_dims, num_tiles);
+
+ ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); // Tensors are expected to be resizable until they are allocated
+ ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); // After allocation they are expected to be fixed
+ ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Fill tensors (same index and range as used by compute_reference)
+ fill(AccessorType(src), 0, -1.f, 1.f);
+
+ output_transform.run();
+
+ return dst;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles, // output_convolved_dims is accepted for signature symmetry but not used below
+ DataLayout data_layout,
+ DataType data_type)
+ {
+ // Create reference
+ SimpleTensor<T> src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout };
+
+ // Fill reference (same index and range as used by compute_target)
+ fill(src, 0, -1.f, 1.f);
+
+ return reference::winograd_output_transform<T>(src, output_shape, kernel_dims, num_tiles);
+ }
+
+ TensorType _target{}; // Output produced by FunctionType
+ SimpleTensor<T> _reference{}; // Output produced by the reference implementation
+};
} // namespace validation
} // namespace test
} // namespace arm_compute
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 24bbf32a30..f3db274935 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -118,4 +118,4 @@ template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &sr
} // namespace reference
} // namespace validation
} // namespace test
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 3ed55fb9fc..c760663b22 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -39,6 +39,87 @@ namespace reference
namespace
{
template <typename T>
+void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+{
+ // Simple tensor for the 3x3 input tile
+ SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+
+ // Simple tensor for the transformation matrix
+ SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+ // Simple tensor for the transformation matrix transpose
+ SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+
+ // Simple tensor for the 4x3 temporary tile
+ SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+ // Simple tensor for the 4x4 output tile
+ SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+
+ // Initialize transformation matrix
+ // 1 | 0 | 0
+ // 0.5 | 0.5 | 0.5
+ // 0.5 |-0.5 | 0.5
+ // 0 | 0 | 1
+ trans_matrix[0 + 0 * 3] = 1.0f;
+ trans_matrix[1 + 0 * 3] = 0.0f;
+ trans_matrix[2 + 0 * 3] = 0.0f;
+ trans_matrix[0 + 1 * 3] = 0.5f;
+ trans_matrix[1 + 1 * 3] = 0.5f;
+ trans_matrix[2 + 1 * 3] = 0.5f;
+ trans_matrix[0 + 2 * 3] = 0.5f;
+ trans_matrix[1 + 2 * 3] = -0.5f;
+ trans_matrix[2 + 2 * 3] = 0.5f;
+ trans_matrix[0 + 3 * 3] = 0.0f;
+ trans_matrix[1 + 3 * 3] = 0.0f;
+ trans_matrix[2 + 3 * 3] = 1.0f;
+
+ // Transpose the transformation matrix
+ transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+ const int num_channels = in.shape()[2];
+ const int num_filters = in.shape()[3];
+ const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters);
+
+ for(int n = 0; n < num_batches; ++n)
+ {
+ for(int w = 0; w < num_filters; ++w)
+ {
+ for(int z = 0; z < num_channels; ++z)
+ {
+ // Load the 3x3 tile from the input tensor
+ get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+
+ // First transformation
+ matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+ // Second transformation
+ matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+ // Store the 4x4 output tile across the 16 channels
+ const int output_offset = w + z * num_filters;
+ out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4];
+ out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4];
+ out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4];
+ out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4];
+ out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4];
+ out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4];
+ out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4];
+ out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4];
+ out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4];
+ out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4];
+ out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
+ out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
+ out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
+ out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
+ out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
+ out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+ }
+ }
+ }
+}
+
+template <typename T>
void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
{
TensorShape shape4x4(4u, 4u);
@@ -112,56 +193,70 @@ void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &ds
}
template <typename T>
-void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+void winograd_output_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out, int num_tiles_x)
{
+ ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16);
+ ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
// Simple tensor for the 3x3 input tile
- SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+ SimpleTensor<T> input_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
// Simple tensor for the transformation matrix
- SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+ SimpleTensor<T> trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 };
// Simple tensor for the transformation matrix transpose
- SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+ SimpleTensor<T> trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 };
// Simple tensor for the 4x3 temporary tile
- SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+ SimpleTensor<T> tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 };
// Simple tensor for the 4x4 output tile
- SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+ SimpleTensor<T> output_tile{ TensorShape(2u, 2u), in.data_type(), 1 };
// Initialize transformation matrix
- // 1 | 0 | 0
- // 0.5 | 0.5 | 0.5
- // 0.5 |-0.5 | 0.5
- // 0 | 0 | 1
- trans_matrix[0 + 0 * 3] = 1.0f;
- trans_matrix[1 + 0 * 3] = 0.0f;
- trans_matrix[2 + 0 * 3] = 0.0f;
- trans_matrix[0 + 1 * 3] = 0.5f;
- trans_matrix[1 + 1 * 3] = 0.5f;
- trans_matrix[2 + 1 * 3] = 0.5f;
- trans_matrix[0 + 2 * 3] = 0.5f;
- trans_matrix[1 + 2 * 3] = -0.5f;
- trans_matrix[2 + 2 * 3] = 0.5f;
- trans_matrix[0 + 3 * 3] = 0.0f;
- trans_matrix[1 + 3 * 3] = 0.0f;
- trans_matrix[2 + 3 * 3] = 1.0f;
+ // 1 | 1 | 1 | 0
+ // 0 | 1 | -1 | -1
+ trans_matrix[0 + 0 * 4] = 1.0f;
+ trans_matrix[1 + 0 * 4] = 1.0f;
+ trans_matrix[2 + 0 * 4] = 1.0f;
+ trans_matrix[3 + 0 * 4] = 0.0f;
+ trans_matrix[0 + 1 * 4] = 0.0f;
+ trans_matrix[1 + 1 * 4] = 1.0f;
+ trans_matrix[2 + 1 * 4] = -1.0f;
+ trans_matrix[3 + 1 * 4] = -1.0f;
// Transpose the transformation matrix
transpose_matrix(trans_matrix, trans_matrix_transposed);
- const int num_channels = in.shape()[2];
- const int num_filters = in.shape()[3];
- const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters);
+ const int w_in = in.shape()[0];
+ const int h_in = in.shape()[1];
+ const int c_in = in.shape()[2];
+ const int w_out = out.shape()[0];
+ const int h_out = out.shape()[1];
+ const int c_out = out.shape()[2];
+ const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+ // Input strides
+ const int stridey_in = w_in;
+ const int stridez_in = stridey_in * h_in;
+ const int stridew_in = stridez_in * c_in;
+
+ // Output strides
+ const int stridey_out = w_out;
+ const int stridez_out = stridey_out * h_out;
+ const int stridew_out = stridez_out * c_out;
for(int n = 0; n < num_batches; ++n)
{
- for(int w = 0; w < num_filters; ++w)
+ for(int y = 0; y < h_in; ++y)
{
- for(int z = 0; z < num_channels; ++z)
+ for(int x = 0; x < w_in; ++x)
{
- // Load the 3x3 tile from the input tensor
- get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+ // Load the 4x4 tile across the 16 channels of the input tensor
+ for(int z = 0; z < c_in; ++z)
+ {
+ input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+ }
// First transformation
matrix_multiply(trans_matrix, input_tile, tmp_tile);
@@ -169,24 +264,29 @@ void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &ou
// Second transformation
matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
- // Store the 4x4 output tile across the 16 channels
- const int output_offset = w + z * num_filters;
- out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4];
- out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4];
- out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4];
- out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4];
- out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4];
- out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4];
- out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4];
- out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4];
- out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4];
- out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4];
- out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
- out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
- out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
- out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
- out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
- out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+ // Store the 2x2 output tile
+ const int xo = (y % num_tiles_x) * 2;
+ const int yo = (y / num_tiles_x) * 2;
+ const int zo = x;
+
+ const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+ out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2];
+
+ // Check out-of-bound writes
+ if(xo + 1 < w_out)
+ {
+ out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2];
+ }
+
+ if(yo + 1 < h_out)
+ {
+ out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2];
+ }
+
+ if((yo + 1 < h_out) && (xo + 1 < w_out))
+ {
+ out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2];
+ }
}
}
}
@@ -234,8 +334,32 @@ SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const Tenso
return out;
}
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles)
+{
+    // Preconditions: NCHW input, square kernel, and dimension 1 of the input equal to the number of tiles
+    ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+    ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
+    ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area());
+
+    // Destination tensor with the requested shape and the data type of the input
+    SimpleTensor<T> dst{ output_shape, in.data_type(), 1 };
+
+    // Only the 3x3 kernel is handled; any other kernel size is reported as an error
+    if(kernel_dims.width == 3)
+    {
+        winograd_output_transform3x3(in, dst, num_tiles.width);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
+    }
+
+    return dst;
+}
+
template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims);
template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
} // namespace reference
} // namespace validation
} // namespace test
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
index ba8e5c1cb6..fa1a7f3f61 100644
--- a/tests/validation/reference/Winograd.h
+++ b/tests/validation/reference/Winograd.h
@@ -41,6 +41,9 @@ SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &src, const Tenso
template <typename T>
SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape);
+
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
} // namespace reference
} // namespace validation
} // namespace test