COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part 4)

Implemented Winograd Output Transform (2x2, 3x3) on OpenCL.
Implemented CLWinogradConvolutionLayer on OpenCL.

Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
author: Gian Marco Iodice <gianmarco.iodice@arm.com> 2018-03-02 11:18:12 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:49:16 +0000
commit: d2fab7315bac3a586f2f1b1c8d64f2441f89ca64 (patch)
tree: 33572f0fea29d24546850f3835703f9869726122 /tests
parent: 27c08abe6947b1ee5b266799f2bb2bf0a05d0def (diff)
download: ComputeLibrary-d2fab7315bac3a586f2f1b1c8d64f2441f89ca64.tar.gz
9 files changed, 637 insertions, 80 deletions
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index 086b2e3def..ec8e09fa81 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,28 @@ namespace test
 {
 namespace datasets
 {
+class LargeWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset // Large 3x3-kernel convolution configurations; consumed by the CL Winograd ConvolutionLayer RunLarge test
+{
+public:
+    LargeWinogradConvolutionLayer3x3Dataset() // Registers every configuration at construction time through add_config()
+    {
+        // Kernel size 3 (every weights shape below starts with 3U, 3U)
+        // Batch size 1 (three-dimensional shapes). Argument order looks like (input, weights, bias, output, pad/stride info) - TODO confirm against ConvolutionLayerDataset::add_config
+        add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 1, 0));
+        add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 512U), TensorShape(512U), TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(12U, 12U, 512U), PadStrideInfo(1, 1, 0, 0));
+        // Batch size 3, 2 and 4 (given as the 4th dimension of the first and fourth shapes)
+        add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 1));
+        add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1));
+        add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 1));
+    }
+};
+
 class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset
 {
 public:
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index adb61de8e2..696c396eef 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -37,10 +37,10 @@ namespace test
 {
 namespace datasets
 {
-class SmallWinogradLayerDataset final : public ConvolutionLayerDataset
+class SmallWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset
 {
 public:
-    SmallWinogradLayerDataset()
+    SmallWinogradConvolutionLayer3x3Dataset()
     {
         // Kernel size 3
         // Batch size 1
@@ -48,8 +48,14 @@ public:
         // Batch size 4
         add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(21U, 25U, 21U, 4U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 1, 1));
+    }
+};
 
-        // Kernel size 5
+class SmallWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset
+{
+public:
+    SmallWinogradConvolutionLayer5x5Dataset()
+    {
         add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U, 1U), TensorShape(1U), TensorShape(4U, 4U, 1U), PadStrideInfo(1, 1, 0, 0));
         add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 2, 2));
     }
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
new file mode 100644
index 0000000000..c42d6c8ebd
--- /dev/null
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+#define ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+
+#include "utils/TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class WinogradOutputTransformDataset // Base dataset: five parallel vectors (input shape, kernel dims, output dims, number of tiles, data layout) exposed as one tuple per configuration
+{
+public:
+    using type = std::tuple<TensorShape, Size2D, Size2D, Size2D, DataLayout>; // Value produced by dereferencing the iterator
+
+    struct iterator // Walks the five vectors in lockstep, one configuration per step
+    {
+        iterator(std::vector<TensorShape>::const_iterator a_it,
+                 std::vector<Size2D>::const_iterator      b_it,
+                 std::vector<Size2D>::const_iterator      c_it,
+                 std::vector<Size2D>::const_iterator      d_it,
+                 std::vector<DataLayout>::const_iterator  data_layout_it)
+            : _a_it{ std::move(a_it) },
+              _b_it{ std::move(b_it) },
+              _c_it{ std::move(c_it) },
+              _d_it{ std::move(d_it) },
+              _data_layout_it{ std::move(data_layout_it) }
+        {
+        }
+
+        std::string description() const // Returns "Input=<a>:KernelDims=<b>:OutputDims=<c>:NumTiles=<d>:DataLayout=<layout>" for the current configuration
+        {
+            std::stringstream description;
+            description << "Input=" << *_a_it << ":";
+            description << "KernelDims=" << *_b_it << ":";
+            description << "OutputDims=" << *_c_it << ":";
+            description << "NumTiles=" << *_d_it << ":";
+            description << "DataLayout=" << *_data_layout_it;
+            return description.str();
+        }
+
+        WinogradOutputTransformDataset::type operator*() const // Copies the five current elements into a tuple
+        {
+            return std::make_tuple(*_a_it, *_b_it, *_c_it, *_d_it, *_data_layout_it);
+        }
+
+        iterator &operator++() // Pre-increment: advances all five underlying iterators together
+        {
+            ++_a_it;
+            ++_b_it;
+            ++_c_it;
+            ++_d_it;
+            ++_data_layout_it;
+
+            return *this;
+        }
+
+    private:
+        std::vector<TensorShape>::const_iterator _a_it; // Position in the input shapes
+        std::vector<Size2D>::const_iterator      _b_it; // Position in the kernel dimensions
+        std::vector<Size2D>::const_iterator      _c_it; // Position in the output dimensions
+        std::vector<Size2D>::const_iterator      _d_it; // Position in the tile counts
+        std::vector<DataLayout>::const_iterator  _data_layout_it; // Position in the data layouts
+    };
+
+    iterator begin() const // Iterator at the first configuration; no end() is defined here, so consumers presumably bound iteration with size() - TODO confirm
+    {
+        return iterator(_a_shapes.begin(), _b_dims.begin(), _c_dims.begin(), _d_dims.begin(), _data_layout.begin());
+    }
+
+    int size() const // Number of configurations: the smallest of the five vector sizes (the size_t result is implicitly converted to int on return)
+    {
+        return std::min(_a_shapes.size(), std::min(_b_dims.size(), std::min(_c_dims.size(), std::min(_d_dims.size(), _data_layout.size()))));
+    }
+
+    void add_config(TensorShape a, Size2D b, Size2D c, Size2D d, DataLayout data_layout) // Appends one entry to each vector, so all five grow together
+    {
+        _a_shapes.emplace_back(std::move(a));
+        _b_dims.emplace_back(std::move(b));
+        _c_dims.emplace_back(std::move(c));
+        _d_dims.emplace_back(std::move(d));
+        _data_layout.emplace_back(std::move(data_layout));
+    }
+
+protected:
+    WinogradOutputTransformDataset()                                  = default; // Protected: only derived datasets can construct the base
+    WinogradOutputTransformDataset(WinogradOutputTransformDataset &&) = default; // Move construction is defaulted; no copy constructor is declared
+
+private:
+    std::vector<TensorShape> _a_shapes{}; // Reported as "Input" by description()
+    std::vector<Size2D>      _b_dims{}; // Reported as "KernelDims"
+    std::vector<Size2D>      _c_dims{}; // Reported as "OutputDims"
+    std::vector<Size2D>      _d_dims{}; // Reported as "NumTiles"
+    std::vector<DataLayout>  _data_layout{}; // Reported as "DataLayout"
+};
+
+class SmallWinogradOutputTransformDataset final : public WinogradOutputTransformDataset // Small output-transform configurations, all 3x3 kernels in NCHW
+{
+public:
+    SmallWinogradOutputTransformDataset() // add_config(input shape, kernel dims, output dims, number of tiles, data layout)
+    {
+        add_config(TensorShape(24U, 49U, 16U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW); // 7 * 7 tiles = 49 = input dimension 1
+        add_config(TensorShape(13U, 6U, 16U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW); // 3 * 2 tiles = 6 = input dimension 1
+        add_config(TensorShape(7U, 20U, 16U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW); // 4 * 5 tiles = 20 = input dimension 1
+        add_config(TensorShape(24U, 49U, 16U, 3U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW); // Same three configurations again with a 4th input dimension of 3, 2 and 5
+        add_config(TensorShape(13U, 6U, 16U, 2U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW);
+        add_config(TensorShape(7U, 20U, 16U, 5U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW);
+    }
+};
+
+class LargeWinogradOutputTransformDataset final : public WinogradOutputTransformDataset // Large output-transform configurations, all 3x3 kernels in NCHW
+{
+public:
+    LargeWinogradOutputTransformDataset() // add_config(input shape, kernel dims, output dims, number of tiles, data layout)
+    {
+        add_config(TensorShape(128U, 3136U, 16U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW); // 56 * 56 tiles = 3136 = input dimension 1
+        add_config(TensorShape(256U, 784U, 16U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW); // 28 * 28 tiles = 784 = input dimension 1
+        add_config(TensorShape(512U, 169U, 16U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW); // 13 * 13 tiles = 169 = input dimension 1
+        add_config(TensorShape(128U, 3136U, 16U, 3U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW); // Same three configurations again with a 4th input dimension of 3, 2 and 5
+        add_config(TensorShape(256U, 784U, 16U, 2U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW);
+        add_config(TensorShape(512U, 169U, 16U, 5U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW);
+    }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET */
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 0b21ed2577..aa668fa575 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -22,17 +22,22 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/CL/Helper.h"
 #include "tests/PaddingCalculator.h"
+#include "tests/datasets/LargeConvolutionLayerDataset.h"
 #include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
 #include "tests/datasets/WinogradFilterTransformDataset.h"
 #include "tests/datasets/WinogradInputTransformDataset.h"
+#include "tests/datasets/WinogradOutputTransformDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -47,7 +52,7 @@ namespace validation
 {
 namespace
 {
-constexpr AbsoluteTolerance<float> tolerance_f32(0.0001f);
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);
 } // namespace
 
 using namespace arm_compute::misc::shape_calculator;
@@ -65,9 +70,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                                                         TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
                                                                                         TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Kernel size not supported
                                                                                         TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32),     // Strides not supported
-                                                                                        TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32),         // valid
-                                                                                        TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32),     // valid
-                                                                                        TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32)         // valid
+                                                                                        TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32),         // Padding needed
+                                                                                        TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32),     // Padding needed
+                                                                                        TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32)         // Padding needed
                                                                                     }),
                                                 framework::dataset::make("OutputInfo", {
                                                                                         TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F16),
@@ -96,7 +101,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                                                                                         Size2D(3U, 3U),
                                                                                         Size2D(3U, 3U)
                                                                                     })),
-                                                framework::dataset::make("Expected", { false, false, false, false, true, true, true })),
+                                                framework::dataset::make("Expected", { false, false, false, false, false, false, false })),
                                             input_info, output_info, conv_info, kernel_dims, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, kernel_dims)) == expected, framework::LogLevel::ERRORS);
@@ -203,8 +208,172 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::Da
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE_END() // FilterTransform
 
+TEST_SUITE(OutputTransform)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( // Seven zipped lists of seven entries each; entry i of every list presumably forms scenario i
+                                                framework::dataset::make("InputInfo",{
+                                                                                        TensorInfo(TensorShape(24U, 49U, 16U, 5U), 1, DataType::F16),        // F16 not supported
+                                                                                        TensorInfo(TensorShape(128U, 3136U, 16U, 5U), 1, DataType::QASYMM8), // QASYMM8 not supported
+                                                                                        TensorInfo(TensorShape(256U, 784U, 16U, 5U), 1, DataType::F32),      // Kernel size not supported (KernelDims entry is 5x5)
+                                                                                        TensorInfo(TensorShape(512U, 169U, 16U, 5U), 1, DataType::F32),      // Valid
+                                                                                        TensorInfo(TensorShape(13U, 6U, 16U, 4U), 1, DataType::F32),         // Padding needed
+                                                                                        TensorInfo(TensorShape(7U, 16U, 16U, 7U), 1, DataType::F32),         // Valid
+                                                                                        TensorInfo(TensorShape(1U, 442U, 16U, 37U), 1, DataType::F32)        // Wrong number of tiles (NumTiles entry gives 26 * 16 = 416, not 442)
+                                                                                    }),
+                                                framework::dataset::make("BiasInfo", {
+                                                                                        TensorInfo(TensorShape(24U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(128U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(256U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(512U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(13U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(1U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(14U, 14U, 24U, 5U), 1, DataType::F16),
+                                                                                        TensorInfo(TensorShape(112U, 112U, 128U, 5U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(55U, 55U, 256U, 5U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(26U, 26U, 512U, 5U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(5U, 4U, 13U, 4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(8U, 8U, 7U, 7U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(51U, 33U, 1U, 37U), 1, DataType::F32)
+                                                                                    })),
+                                                framework::dataset::make("KernelDims", {
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(5U, 5U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U),
+                                                                                        Size2D(3U, 3U)
+                                                                                    })),
+                                                framework::dataset::make("OutputDims", {
+                                                                                        Size2D(14U, 14U),
+                                                                                        Size2D(112U, 112U),
+                                                                                        Size2D(55U, 55U),
+                                                                                        Size2D(26U, 26U),
+                                                                                        Size2D(5U, 4U),
+                                                                                        Size2D(8U, 8U),
+                                                                                        Size2D(51U, 33U)
+                                                                                    })),
+                                                framework::dataset::make("NumTiles", {
+                                                                                        Size2D(7U, 7U),
+                                                                                        Size2D(56U, 56U),
+                                                                                        Size2D(28U, 28U),
+                                                                                        Size2D(13U, 13U),
+                                                                                        Size2D(3U, 2U),
+                                                                                        Size2D(4U, 4U),
+                                                                                        Size2D(26U, 16U)
+                                                                                    })),
+                                                framework::dataset::make("Expected", { false, false, false, true, false, true, false })), // Expected validate() outcome for each scenario, in list order
+                                            input_info, bias_info, output_info, kernel_dims, output_dims, num_tiles, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), kernel_dims, output_dims, num_tiles)) == expected, framework::LogLevel::ERRORS); // validate() receives non-resizable clones of each TensorInfo; its result converted to bool must equal `expected`
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradOutputTransform        = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradOutputTransformKernel, 0>; // Function-style wrapper around the kernel so tests can configure it like a function; NOTE(review): the meaning of the second template argument (0) is defined in tests/CL/Helper.h - confirm there
+using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>; // Validation fixture instantiated for CL tensors and float data
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallWinogradOutputTransformDataset(), datasets::LargeWinogradOutputTransformDataset()), // Small and large datasets concatenated
+                                                                   framework::dataset::make("DataType", { DataType::F32 })), // ... combined with F32 as the only data type
+               shape_a, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type)
+{
+    TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), output_convolved_dims, data_layout); // Output shape derived from the input info via the shape calculator
+
+    // Create tensors: a has the input shape, b has the computed output shape
+    CLTensor a = create_tensor<CLTensor>(shape_a, data_type);
+    CLTensor b = create_tensor<CLTensor>(shape_b, data_type);
+
+    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); // Both tensors are expected to still be resizable at this point
+    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function; the bias argument is nullptr and nothing is asserted after configure()
+    CLWinogradOutputTransform winograd_output_transform;
+    winograd_output_transform.configure(&a, nullptr, &b, kernel_dims, output_convolved_dims, num_tiles);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, combine(datasets::SmallWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 }))) // Small dataset combined with F32; registered for DatasetMode::ALL
+{
+    // Validate output: compare the fixture's _target (read through CLAccessor) with _reference using tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 }))) // Large dataset combined with F32; registered for DatasetMode::NIGHTLY
+{
+    // Validate output: compare the fixture's _target (read through CLAccessor) with _reference using tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // OutputTransform
+
+TEST_SUITE(ConvolutionLayer)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( // Six zipped lists of five entries each; entry i of every list presumably forms scenario i
+                                                framework::dataset::make("InputInfo", {
+                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16),     // FP16 not supported
+                                                                                        TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32),     // Datatype mismatch (weights entry is QASYMM8)
+                                                                                        TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported (ConvInfo entry is PadStrideInfo(1, 2, 0, 0))
+                                                                                        TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32),     // Padding needed
+                                                                                        TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32)  // Kernel size not supported (weights entry is 5x5)
+                                                                                      }),
+                                                framework::dataset::make("WeightsInfo", {
+                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
+                                                                                        })),
+                                                framework::dataset::make("BiasesInfo", {
+                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(21U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U), 1, DataType::F32)
+                                                                                       })),
+                                                framework::dataset::make("OutputInfo", {
+                                                                                        TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
+                                                                                       })),
+                                                framework::dataset::make("ConvInfo", {
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 2, 0, 0),
+                                                                                        PadStrideInfo(1, 1, 1, 1),
+                                                                                        PadStrideInfo(1, 1, 1, 0)
+                                                                                                                 })),
+                                                framework::dataset::make("Expected", { false, false, false, false, false })), // Every scenario is expected to be rejected by validate()
+               input_info, weights_info, bias_info, output_info, conv_info, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS); // validate() receives non-resizable clones of each TensorInfo; its result converted to bool must equal `expected`
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>; // Validation fixture instantiated for CL tensors and float data
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), // Only the 3x3 small dataset is used here; NOTE(review): this case is PRECOMMIT whereas the OutputTransform RunSmall case is ALL - confirm that is intended
+                                                                                                               framework::dataset::make("DataType", { DataType::F32 })))
+{
+    // Validate output: compare the fixture's _target (read through CLAccessor) with _reference using tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), framework::dataset::make("DataType", { DataType::F32 }))) // Large 3x3 dataset combined with F32; registered for DatasetMode::NIGHTLY
+{
+    // Validate output: compare the fixture's _target (read through CLAccessor) with _reference using tolerance_f32
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // ConvolutionLayer
+
 TEST_SUITE_END() // Winograd
 TEST_SUITE_END() // CL
 } // namespace validation
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 59db279ac7..34306b381c 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -109,10 +109,12 @@ TEST_SUITE_END()
 
 TEST_SUITE(WinogradLayer)
 template <typename T>
-using NEWinogradLayerFixture = WinogradLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
+using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradLayerFixture<float>, framework::DatasetMode::PRECOMMIT, datasets::SmallWinogradLayerDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+                                                                                                                      datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+                                                                                                                      framework::dataset::make("DataType", { DataType::F32 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/WinogradLayerFixture.h
index bfe1efce3b..9811c28008 100644
--- a/tests/validation/fixtures/WinogradLayerFixture.h
+++ b/tests/validation/fixtures/WinogradLayerFixture.h
@@ -48,14 +48,14 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WinogradLayerValidationFixture : public framework::Fixture
+class WinogradConvolutionLayerValidationFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info)
+    void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, DataType data_type)
     {
-        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info);
-        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info);
+        _target    = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
+        _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
     }
 
 protected:
@@ -79,13 +79,14 @@ protected:
         }
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                              DataType data_type)
     {
         // Create tensors
-        TensorType src     = create_tensor<TensorType>(input_shape, DataType::F32, 1);
-        TensorType weights = create_tensor<TensorType>(weights_shape, DataType::F32, 1);
-        TensorType bias    = create_tensor<TensorType>(bias_shape, DataType::F32, 1);
-        TensorType dst     = create_tensor<TensorType>(output_shape, DataType::F32, 1);
+        TensorType src     = create_tensor<TensorType>(input_shape, data_type, 1);
+        TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+        TensorType bias    = create_tensor<TensorType>(bias_shape, data_type, 1);
+        TensorType dst     = create_tensor<TensorType>(output_shape, data_type, 1);
 
         // Create and configure function
         FunctionType conv;
@@ -111,20 +112,20 @@ protected:
         fill(AccessorType(src), 0, -1.f, 1.f);
         fill(AccessorType(weights), 1, -1.f, 1.f);
         fill(AccessorType(bias), 2, -1.f, 1.f);
-        fill(AccessorType(dst), 3, -1.f, 1.f);
 
-        // Compute NEWinogradLayer function
+        // Compute Winograd Convolution function
         conv.run();
 
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+                                      DataType data_type)
     {
         // Create reference
-        SimpleTensor<T> src{ input_shape, DataType::F32, 1 };
-        SimpleTensor<T> weights{ weights_shape, DataType::F32, 1 };
-        SimpleTensor<T> bias{ bias_shape, DataType::F32, 1 };
+        SimpleTensor<T> src{ input_shape, data_type, 1 };
+        SimpleTensor<T> weights{ weights_shape, data_type, 1 };
+        SimpleTensor<T> bias{ bias_shape, data_type, 1 };
 
         // Fill reference
         fill(src, 0, -1.f, 1.f);
@@ -136,8 +137,6 @@ protected:
 
     TensorType      _target{};
     SimpleTensor<T> _reference{};
-    int             _fractional_bits{};
-    DataType        _data_type{};
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -178,7 +177,6 @@ protected:
     {
         ARM_COMPUTE_UNUSED(is_nchw_format);
 
-        // Create tensors
         TensorType src = create_tensor<TensorType>(input_shape, data_type);
         TensorType dst = create_tensor<TensorType>(output_shape, data_type);
 
@@ -261,8 +259,8 @@ protected:
         ARM_COMPUTE_UNUSED(is_nchw_format);
 
         // Create tensors
-        TensorType src = create_tensor<TensorType>(input_shape, data_type);
-        TensorType dst = create_tensor<TensorType>(output_shape, data_type);
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1);
 
         // Create and configure function
         FunctionType filter_transform;
@@ -288,7 +286,7 @@ protected:
 
     SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, bool is_nchw_format, DataType data_type)
     {
-        ARM_COMPUTE_ERROR_ON(!is_nchw_format);
+        ARM_COMPUTE_UNUSED(is_nchw_format);
 
         // Create reference
         SimpleTensor<T> src{ input_shape, data_type, 1 };
@@ -302,6 +300,86 @@ protected:
     TensorType      _target{};
     SimpleTensor<T> _reference{};
 };
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradOutputTransformValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, Size2D kernel_dims, Size2D output_convolved_dims, Size2D num_tiles, DataLayout data_layout, DataType data_type)
+    {
+        TensorShape output_shape = compute_winograd_output_transform_shape(TensorInfo(input_shape, 1, data_type), output_convolved_dims, data_layout);
+
+        _target    = compute_target(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+        _reference = compute_reference(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, float min, float max)
+    {
+        switch(tensor.data_type())
+        {
+            case DataType::F32:
+            {
+                std::uniform_real_distribution<> distribution(min, max);
+                library->fill(tensor, distribution, i);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                library->fill_tensor_uniform(tensor, i);
+                break;
+            }
+        }
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles, DataLayout data_layout,
+                              DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+
+        // Create and configure function
+        FunctionType output_transform;
+        output_transform.configure(&src, nullptr, &dst, kernel_dims, output_convolved_dims, num_tiles);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src), 0, -1.f, 1.f);
+
+        output_transform.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles,
+                                      DataLayout data_layout,
+                                      DataType   data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout };
+
+        // Fill reference
+        fill(src, 0, -1.f, 1.f);
+
+        return reference::winograd_output_transform<T>(src, output_shape, kernel_dims, num_tiles);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 24bbf32a30..f3db274935 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -118,4 +118,4 @@ template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &sr
 } // namespace reference
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 3ed55fb9fc..c760663b22 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -39,6 +39,87 @@ namespace reference
 namespace
 {
 template <typename T>
+void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+{
+    // Simple tensor for the 3x3 input tile
+    SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix
+    SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+    // Simple tensor for the transformation matrix transpose
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+
+    // Simple tensor for the 4x3 temporary tile
+    SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+    // Simple tensor for the 4x4 output tile
+    SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+
+    // Initialize transformation matrix
+    // 1   | 0   | 0
+    // 0.5 | 0.5 | 0.5
+    // 0.5 |-0.5 | 0.5
+    // 0   | 0   | 1
+    trans_matrix[0 + 0 * 3] = 1.0f;
+    trans_matrix[1 + 0 * 3] = 0.0f;
+    trans_matrix[2 + 0 * 3] = 0.0f;
+    trans_matrix[0 + 1 * 3] = 0.5f;
+    trans_matrix[1 + 1 * 3] = 0.5f;
+    trans_matrix[2 + 1 * 3] = 0.5f;
+    trans_matrix[0 + 2 * 3] = 0.5f;
+    trans_matrix[1 + 2 * 3] = -0.5f;
+    trans_matrix[2 + 2 * 3] = 0.5f;
+    trans_matrix[0 + 3 * 3] = 0.0f;
+    trans_matrix[1 + 3 * 3] = 0.0f;
+    trans_matrix[2 + 3 * 3] = 1.0f;
+
+    // Transpose the transformation matrix
+    transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+    const int num_channels = in.shape()[2];
+    const int num_filters  = in.shape()[3];
+    const int num_batches  = in.shape().total_size() / (9 * num_channels * num_filters);
+
+    for(int n = 0; n < num_batches; ++n)
+    {
+        for(int w = 0; w < num_filters; ++w)
+        {
+            for(int z = 0; z < num_channels; ++z)
+            {
+                // Load the 3x3 tile from the input tensor
+                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+
+                // First transformation
+                matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+                // Second transformation
+                matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+                // Store the 4x4 output tile across the 16 channels
+                const int output_offset                              = w + z * num_filters;
+                out[output_offset + 0 * num_filters * num_channels]  = output_tile[0 + 0 * 4];
+                out[output_offset + 1 * num_filters * num_channels]  = output_tile[1 + 0 * 4];
+                out[output_offset + 2 * num_filters * num_channels]  = output_tile[2 + 0 * 4];
+                out[output_offset + 3 * num_filters * num_channels]  = output_tile[3 + 0 * 4];
+                out[output_offset + 4 * num_filters * num_channels]  = output_tile[0 + 1 * 4];
+                out[output_offset + 5 * num_filters * num_channels]  = output_tile[1 + 1 * 4];
+                out[output_offset + 6 * num_filters * num_channels]  = output_tile[2 + 1 * 4];
+                out[output_offset + 7 * num_filters * num_channels]  = output_tile[3 + 1 * 4];
+                out[output_offset + 8 * num_filters * num_channels]  = output_tile[0 + 2 * 4];
+                out[output_offset + 9 * num_filters * num_channels]  = output_tile[1 + 2 * 4];
+                out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
+                out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
+                out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
+                out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
+                out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
+                out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+            }
+        }
+    }
+}
+
+template <typename T>
 void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
 {
     TensorShape shape4x4(4u, 4u);
@@ -112,56 +193,70 @@ void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &ds
 }
 
 template <typename T>
-void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+void winograd_output_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out, int num_tiles_x)
 {
+    ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16);
+    ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
     // Simple tensor for the 3x3 input tile
-    SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+    SimpleTensor<T> input_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
 
     // Simple tensor for the transformation matrix
-    SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+    SimpleTensor<T> trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 };
 
     // Simple tensor for the transformation matrix transpose
-    SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+    SimpleTensor<T> trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 };
 
     // Simple tensor for the 4x3 temporary tile
-    SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+    SimpleTensor<T> tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 };
 
     // Simple tensor for the 4x4 output tile
-    SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+    SimpleTensor<T> output_tile{ TensorShape(2u, 2u), in.data_type(), 1 };
 
     // Initialize transformation matrix
-    // 1   | 0   | 0
-    // 0.5 | 0.5 | 0.5
-    // 0.5 |-0.5 | 0.5
-    // 0   | 0   | 1
-    trans_matrix[0 + 0 * 3] = 1.0f;
-    trans_matrix[1 + 0 * 3] = 0.0f;
-    trans_matrix[2 + 0 * 3] = 0.0f;
-    trans_matrix[0 + 1 * 3] = 0.5f;
-    trans_matrix[1 + 1 * 3] = 0.5f;
-    trans_matrix[2 + 1 * 3] = 0.5f;
-    trans_matrix[0 + 2 * 3] = 0.5f;
-    trans_matrix[1 + 2 * 3] = -0.5f;
-    trans_matrix[2 + 2 * 3] = 0.5f;
-    trans_matrix[0 + 3 * 3] = 0.0f;
-    trans_matrix[1 + 3 * 3] = 0.0f;
-    trans_matrix[2 + 3 * 3] = 1.0f;
+    // 1   | 1   | 1   | 0
+    // 0   | 1   | -1  | -1
+    trans_matrix[0 + 0 * 4] = 1.0f;
+    trans_matrix[1 + 0 * 4] = 1.0f;
+    trans_matrix[2 + 0 * 4] = 1.0f;
+    trans_matrix[3 + 0 * 4] = 0.0f;
+    trans_matrix[0 + 1 * 4] = 0.0f;
+    trans_matrix[1 + 1 * 4] = 1.0f;
+    trans_matrix[2 + 1 * 4] = -1.0f;
+    trans_matrix[3 + 1 * 4] = -1.0f;
 
     // Transpose the transformation matrix
     transpose_matrix(trans_matrix, trans_matrix_transposed);
 
-    const int num_channels = in.shape()[2];
-    const int num_filters  = in.shape()[3];
-    const int num_batches  = in.shape().total_size() / (9 * num_channels * num_filters);
+    const int w_in        = in.shape()[0];
+    const int h_in        = in.shape()[1];
+    const int c_in        = in.shape()[2];
+    const int w_out       = out.shape()[0];
+    const int h_out       = out.shape()[1];
+    const int c_out       = out.shape()[2];
+    const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+    // Input strides
+    const int stridey_in = w_in;
+    const int stridez_in = stridey_in * h_in;
+    const int stridew_in = stridez_in * c_in;
+
+    // Output strides
+    const int stridey_out = w_out;
+    const int stridez_out = stridey_out * h_out;
+    const int stridew_out = stridez_out * c_out;
 
     for(int n = 0; n < num_batches; ++n)
     {
-        for(int w = 0; w < num_filters; ++w)
+        for(int y = 0; y < h_in; ++y)
         {
-            for(int z = 0; z < num_channels; ++z)
+            for(int x = 0; x < w_in; ++x)
             {
-                // Load the 3x3 tile from the input tensor
-                get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+                // Load the 4x4 tile across the 16 channels of the input tensor
+                for(int z = 0; z < c_in; ++z)
+                {
+                    input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+                }
 
                 // First transformation
                 matrix_multiply(trans_matrix, input_tile, tmp_tile);
@@ -169,24 +264,29 @@ void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &ou
                 // Second transformation
                 matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
 
-                // Store the 4x4 output tile across the 16 channels
-                const int output_offset                              = w + z * num_filters;
-                out[output_offset + 0 * num_filters * num_channels]  = output_tile[0 + 0 * 4];
-                out[output_offset + 1 * num_filters * num_channels]  = output_tile[1 + 0 * 4];
-                out[output_offset + 2 * num_filters * num_channels]  = output_tile[2 + 0 * 4];
-                out[output_offset + 3 * num_filters * num_channels]  = output_tile[3 + 0 * 4];
-                out[output_offset + 4 * num_filters * num_channels]  = output_tile[0 + 1 * 4];
-                out[output_offset + 5 * num_filters * num_channels]  = output_tile[1 + 1 * 4];
-                out[output_offset + 6 * num_filters * num_channels]  = output_tile[2 + 1 * 4];
-                out[output_offset + 7 * num_filters * num_channels]  = output_tile[3 + 1 * 4];
-                out[output_offset + 8 * num_filters * num_channels]  = output_tile[0 + 2 * 4];
-                out[output_offset + 9 * num_filters * num_channels]  = output_tile[1 + 2 * 4];
-                out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
-                out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
-                out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
-                out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
-                out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
-                out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+                // Store the 2x2 output tile
+                const int xo = (y % num_tiles_x) * 2;
+                const int yo = (y / num_tiles_x) * 2;
+                const int zo = x;
+
+                const int output_offset                  = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+                out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2];
+
+                // Check out-of-bound writes
+                if(xo + 1 < w_out)
+                {
+                    out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2];
+                }
+
+                if(yo + 1 < h_out)
+                {
+                    out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2];
+                }
+
+                if((yo + 1 < h_out) && (xo + 1 < w_out))
+                {
+                    out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2];
+                }
             }
         }
     }
@@ -234,8 +334,32 @@ SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const Tenso
     return out;
 }
 
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+    ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
+    ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area());
+
+    // Create reference
+    SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+    switch(kernel_dims.width)
+    {
+        case 3:
+            winograd_output_transform3x3(in, out, num_tiles.width);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
+            break;
+    }
+
+    return out;
+}
+
 template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims);
 template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
index ba8e5c1cb6..fa1a7f3f61 100644
--- a/tests/validation/reference/Winograd.h
+++ b/tests/validation/reference/Winograd.h
@@ -41,6 +41,9 @@ SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &src, const Tenso
 
 template <typename T>
 SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape);
+
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
 } // namespace reference
 } // namespace validation
 } // namespace test
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	2018-03-02 11:18:12 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:49:16 +0000
commit	d2fab7315bac3a586f2f1b1c8d64f2441f89ca64 (patch)
tree	33572f0fea29d24546850f3835703f9869726122 /tests
parent	27c08abe6947b1ee5b266799f2bb2bf0a05d0def (diff)
download	ComputeLibrary-d2fab7315bac3a586f2f1b1c8d64f2441f89ca64.tar.gz