aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2018-03-02 11:18:12 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:49:16 +0000
commitd2fab7315bac3a586f2f1b1c8d64f2441f89ca64 (patch)
tree33572f0fea29d24546850f3835703f9869726122
parent27c08abe6947b1ee5b266799f2bb2bf0a05d0def (diff)
downloadComputeLibrary-d2fab7315bac3a586f2f1b1c8d64f2441f89ca64.tar.gz
COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part 4)
Implemented Winograd Output Transform (2x2,3x3) on OpenCL Implemented CLWinogradConvolutionLayer on OpenCL Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: Jenkins <bsgcomp@arm.com>
-rw-r--r--arm_compute/core/CL/CLKernels.h1
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h1
-rw-r--r--arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h81
-rw-r--r--arm_compute/core/utils/misc/ShapeCalculator.h40
-rw-r--r--arm_compute/runtime/CL/CLFunctions.h1
-rw-r--r--arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h97
-rw-r--r--src/core/CL/CLKernelLibrary.cpp1
-rw-r--r--src/core/CL/cl_kernels/gemm.cl127
-rw-r--r--src/core/CL/cl_kernels/winograd.cl247
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp21
-rw-r--r--src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp7
-rw-r--r--src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp2
-rw-r--r--src/core/CL/kernels/CLWinogradInputTransformKernel.cpp7
-rw-r--r--src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp188
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp14
-rw-r--r--src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp146
-rw-r--r--src/runtime/CL/functions/CLWinogradInputTransform.cpp2
-rw-r--r--tests/datasets/LargeConvolutionLayerDataset.h24
-rw-r--r--tests/datasets/SmallConvolutionLayerDataset.h12
-rw-r--r--tests/datasets/WinogradOutputTransformDataset.h153
-rw-r--r--tests/validation/CL/Winograd.cpp179
-rw-r--r--tests/validation/NEON/ConvolutionLayer.cpp6
-rw-r--r--tests/validation/fixtures/WinogradLayerFixture.h120
-rw-r--r--tests/validation/reference/ConvolutionLayer.cpp2
-rw-r--r--tests/validation/reference/Winograd.cpp218
-rw-r--r--tests/validation/reference/Winograd.h3
26 files changed, 1506 insertions, 194 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index ef629c2e81..6f5c61523f 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -111,5 +111,6 @@
#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
#endif /* __ARM_COMPUTE_CLKERNELS_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 7260c4a4f6..ee7e7c0e97 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -84,6 +84,7 @@ private:
const ICLTensor *_input0;
const ICLTensor *_input1;
ICLTensor *_output;
+ bool _slide_matrix_b;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
new file mode 100644
index 0000000000..35117c65db
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__
+#define __ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the Winograd output transform kernel. */
+class CLWinogradOutputTransformKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLWinogradOutputTransformKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default;
+ /** Default destructor */
+ ~CLWinogradOutputTransformKernel() = default;
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor with shape [C, N, 16, batches]. Data types supported: F32.
+ * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
+ * @param[out] output Destination tensor with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
+ * @param[in] kernel_dims Kernel dimensions (Width and height). Currently only supported 3x3 kernels
+ * @param[in] output_convolved_dims Output dimensions after the convolution (Width and height)
+ * @param[in] num_tiles Number of tiles of size 2x2 in the output tensor along the X and Y direction
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
+ *
+ * @param[in] input Source tensor with shape [C, N, 16, batches]. Data types supported: F32.
+ * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
+ * @param[out] output Destination tensor with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
+ * @param[in] kernel_dims Kernel dimensions (Width and height). Currently only supported 3x3 kernels
+ * @param[in] output_convolved_dims Output dimensions after the convolution (Width and height)
+ * @param[in] num_tiles Number of tiles of size 2x2 in the output tensor along the X and Y direction
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__ */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 1e90927a93..5344ce7e74 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -28,6 +28,8 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include <cmath>
+
namespace arm_compute
{
namespace misc
@@ -233,19 +235,45 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp
return output_shape;
}
+
+inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &input, const Size2D &output_convolved_dims, DataLayout data_layout)
+{
+ TensorShape tensor_shape{ input.tensor_shape() };
+
+ // Output dimension
+ const unsigned int out_w = output_convolved_dims.width;
+ const unsigned int out_h = output_convolved_dims.height;
+ const unsigned int out_c = input.dimension(0);
+
+ tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH), out_w);
+ tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT), out_h);
+ tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL), out_c);
+
+ return tensor_shape;
+}
+
inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
{
const TensorShape input_shape{ input.tensor_shape() };
const TensorShape weights_shape{ weights.tensor_shape() };
- unsigned int output_width = 0;
- unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+ const size_t idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
+
+ const unsigned int input_width = input_shape[idx_width];
+ const unsigned int input_height = input_shape[idx_height];
+ const unsigned int weights_width = weights_shape[idx_width];
+ const unsigned int weights_height = weights_shape[idx_height];
+ const unsigned int weights_channel = weights_shape[idx_channel];
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+ std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
TensorShape output_shape{ input_shape };
- output_shape.set(0, output_width);
- output_shape.set(1, output_height);
- output_shape.set(2, weights_shape[3]);
+ output_shape.set(idx_width, output_width);
+ output_shape.set(idx_height, output_height);
+ output_shape.set(idx_channel, weights_channel);
return output_shape;
}
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 7c2377aa09..adf240e642 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -107,6 +107,7 @@
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
#endif /* __ARM_COMPUTE_CLFUNCTIONS_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
new file mode 100644
index 0000000000..14de169236
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__
+#define __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__
+
+#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
+ *
+ * -# @ref CLWinogradInputTransform
+ * -# @ref CLWinogradFilterTransformKernel (only once)
+ * -# @ref CLGEMM
+ * -# @ref CLWinogradOutputTransformKernel
+ *
+ */
+class CLWinogradConvolutionLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Set the input and output tensors.
+ *
+ * @note: This function only works with 3x3 kernels and unit strides
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer
+ *
+ * @note: This function only works with 3x3 kernels and unit strides
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLMemoryGroup _memory_group;
+ CLGEMM _batched_mm;
+ CLWinogradInputTransform _input_transform;
+ CLWinogradFilterTransformKernel _filter_transform;
+ CLWinogradOutputTransformKernel _output_transform;
+ CLTensor _input0;
+ CLTensor _input1;
+ CLTensor _batched_mm_output;
+ bool _is_first_run;
+};
+}
+#endif /* __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__ */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 4b7fa8a3b3..9df2dcbacd 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -354,6 +354,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "winograd_filter_transform_2x2_3x3_nchw", "winograd.cl" },
{ "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" },
{ "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" },
+ { "winograd_output_transform_2x2_3x3_nchw", "winograd.cl" },
{ "YUYV422_to_IYUV_bt709", "color_convert.cl" },
{ "YUYV422_to_NV12_bt709", "color_convert.cl" },
{ "YUYV422_to_RGB888_bt709", "color_convert.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index cba5eea437..a5b0acbe9c 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -162,6 +162,8 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -199,8 +201,18 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global float *src_addr_a = (__global float *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global float *src_end_addr_b = src_addr_b + COLS_B;
@@ -277,6 +289,9 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -314,8 +329,18 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global float *src_addr_a = (__global float *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global float *src_end_addr_b = src_addr_b + COLS_B;
@@ -510,6 +535,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -547,8 +574,18 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global half *src_addr_a = (__global half *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global half *src_addr_b = (__global half *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global half *src_end_addr_b = src_addr_b + COLS_B;
@@ -627,8 +664,9 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 8 bit fixed point format
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note:ALPHA must be passed in 8 bit fixed point format
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -666,8 +704,18 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global char *src_addr_a = src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
- __global char *src_addr_b = src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes;
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes);
+ __global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global char *src_end_addr_b = src_addr_b + COLS_B;
@@ -738,8 +786,9 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- *
- * @note: ALPHA must be passed in 16 bit fixed point format
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ * @note:ALPHA must be passed in 16 bit fixed point format
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -777,8 +826,18 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global short *src_addr_a = (__global short *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
- __global short *src_addr_b = (__global short *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes);
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes);
+ __global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes);
// Compute end row address for matrix B
__global short *src_end_addr_b = src_addr_b + COLS_B;
@@ -845,6 +904,8 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -885,7 +946,13 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
// Add offset for batched GEMM
src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
@@ -1013,6 +1080,8 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -1054,8 +1123,12 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
// Add offset for batched GEMM
src_addr.s0 += get_global_id(2) * src0_stride_z;
- // For convolution layer we do not want to slide the matrix B along Z
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
// Address boundary for matrix A
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
@@ -1251,6 +1324,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -1293,8 +1368,12 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
// Add offset for batched GEMM
src_addr.s0 += get_global_id(2) * src0_stride_z;
- // For convolution layer we do not want to slide the matrix B along Z
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
// Address boundary for the matrix A
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
@@ -1460,6 +1539,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
* @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
* @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
* @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -1500,7 +1581,13 @@ __kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
// Add offset for batched GEMM
src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));
@@ -1636,6 +1723,8 @@ __kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),
* @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
* @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
* @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16
* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
@@ -1676,7 +1765,13 @@ __kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),
// Add offset for batched GEMM
src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));
diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl
index 238e21a18a..25c129d0aa 100644
--- a/src/core/CL/cl_kernels/winograd.cl
+++ b/src/core/CL/cl_kernels/winograd.cl
@@ -23,8 +23,102 @@
*/
#include "helpers.h"
-#if defined(NUM_TILES_X)
+#if defined(NUM_CHANNELS)
+
+/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2
+ *
+ * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+ float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+
+ // Transform the 3x3 tile in a 4x4 tile
+ float4 out0 = 0.0f;
+ float4 out1 = 0.0f;
+ float4 out2 = 0.0f;
+ float4 out3 = 0.0f;
+
+ // Row 0
+ out0.s0 = (w0.s0);
+ out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+ out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+ out0.s3 = (w0.s2);
+
+ // Row 1
+ out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+ out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+ // Row 2
+ out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+ out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+ out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+ out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+ // Row 3
+ out3.s0 = (w2.s0);
+ out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+ out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+ out3.s3 = (w2.s2);
+ int z = get_global_id(2);
+ int x0 = z / NUM_CHANNELS; // idx filter
+ int y0 = z % NUM_CHANNELS; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the 16 values across the 16 channels
+ *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global float *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+ *(__global float *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+ *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+ *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+ *(__global float *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+ *(__global float *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+ *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+ *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+ *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+ *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+ *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+ *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+}
+#endif // defined(NUM_CHANNELS)
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
/** This OpenCL kernel computes the input transform when the kernel size is 3x3 and the output tile is 2x2
*
* @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
@@ -205,13 +299,12 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z));
vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z));
}
-#endif //defined(NUM_TILES_X)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP)
-#if defined(NUM_CHANNELS)
-
-/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2
+#if defined(NUM_TILES_X)
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2, the filter size 3x3 and the data format is NCHW
*
- * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -220,8 +313,6 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -232,72 +323,84 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void winograd_filter_transform_2x2_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
- float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
- float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
- float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
-
- // Transform the 3x3 tile in a 4x4 tile
- float4 out0 = 0.0f;
- float4 out1 = 0.0f;
- float4 out2 = 0.0f;
- float4 out3 = 0.0f;
-
- // Row 0
- out0.s0 = (w0.s0);
- out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
- out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
- out0.s3 = (w0.s2);
-
- // Row 1
- out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
- out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
-
- // Row 2
- out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
- out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
- out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
- out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
-
- // Row 3
- out3.s0 = (w2.s0);
- out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
- out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
- out3.s3 = (w2.s2);
-
- int z = get_global_id(2);
- int x0 = z / NUM_CHANNELS; // idx filter
- int y0 = z % NUM_CHANNELS; // idx channel
+ // Each thread stores a 2x2 tile
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+ // Load the values across the 16 channels to compose the 4x4 tile
+ float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+ float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+ float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+ float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+
+ float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
+ float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
+ float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
+ float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
+
+ float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
+ float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
+ float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
+ float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
+
+ float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
+ float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
+ float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
+ float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
+
+ // Compute the 2x2 output tile
+ float k0 = d01 + d11 + d21;
+ float k1 = d02 + d12 + d22;
+ float k2 = d11 - d21 - d31;
+ float k3 = d12 - d22 - d32;
+
+ // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+ // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+ // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
+ // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
+
+ float out00 = d10;
+ float out01 = -d13;
+ float out10 = d10;
+ float out11 = -d13;
+
+ out00 += d00 + d20 + k0 + k1;
+ out01 += k0 - k1 - (d03 + d23);
+ out10 += -d20 - d30 + k2 + k3;
+ out11 += k2 - k3 + d23 + d33;
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * 2;
+ int y_out = (y_in / NUM_TILES_X) * 2;
+ int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out10 += (float)b;
+ out11 += (float)b;
+#endif // defined(HAS_BIAS)
// Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
- // Store the 16 values across the 16 channels
- *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global float *)(dst_addr + 4 * dst_stride_z) = out1.s0;
- *(__global float *)(dst_addr + 5 * dst_stride_z) = out1.s1;
- *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s2;
- *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s3;
- *(__global float *)(dst_addr + 8 * dst_stride_z) = out2.s0;
- *(__global float *)(dst_addr + 9 * dst_stride_z) = out2.s1;
- *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
- *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
- *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
- *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
- *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
- *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+ // Store the 2x2 output tile
+ vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
}
-#endif // defined(NUM_CHANNELS)
+#endif // defined(NUM_TILES_X)
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 9c69800928..7b785bb8da 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -55,6 +55,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
if(!is_interleaved_transposed)
{
@@ -174,7 +175,7 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
} // namespace
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
{
}
@@ -192,9 +193,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
- _input0 = input0;
- _input1 = input1;
- _output = output;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions();
const DataType data_type = input0->info()->data_type();
const int fp_pos = input0->info()->fixed_point_position();
@@ -257,6 +259,9 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
"-DALPHA=" + float_to_string_with_full_precision(alpha));
}
+ // Do not slide matrix B if _slide_matrix_b = false
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
std::string kernel_name;
if(is_interleaved_transposed)
{
@@ -365,7 +370,7 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
Window slice_b = slice;
// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
// This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(_input1->info()->num_dimensions() < 3)
+ if(!_slide_matrix_b)
{
slice_b = slice_matrix_b;
}
@@ -374,9 +379,9 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
add_2D_tensor_argument(idx, _input0, slice);
add_2D_tensor_argument(idx, _input1, slice_b);
add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[3]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
enqueue(queue, *this, slice, _lws_hint);
}
while(window.slide_window_slice_3D(slice));
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 5489fde818..f69a39e4ad 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -76,15 +76,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- window_changed = window_changed || update_window_and_padding(win, input_access);
// Configure window in case of configured output
if(output->total_size() != 0)
{
AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
- window_changed = window_changed || update_window_and_padding(win, output_access);
+ window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
}
+ else
+ {
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+ }
// Collapse along the Z direction
Window collapsed = win.collapse(win, Window::DimZ);
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index 3dbbe157b2..655b82bf66 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -76,7 +76,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->valid_region());
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
Window win_collapsed = win.collapse(win, Window::DimZ);
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 72adb5f358..3b9350f9ba 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -44,11 +44,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_dims.width != 3 || kernel_dims.height != 3, "Winograd input transform only supports 3x3 kernels");
ARM_COMPUTE_UNUSED(kernel_dims);
- const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, Size2D(3U, 3U));
-
// Validate configured output
if(output->total_size() != 0)
{
+ const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, kernel_dims);
+
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -151,7 +151,8 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &conv_info, const Size2D &kernel_dims)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, conv_info, kernel_dims));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, conv_info, kernel_dims));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), conv_info, kernel_dims).first);
return Status{};
}
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
new file mode 100644
index 0000000000..c9823275eb
--- /dev/null
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_dims.width != 3 || kernel_dims.height != 3, "Only 3x3 kernels are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<unsigned int>(std::sqrt(input->dimension(2))) != 4, "Only 2x2 output tile is supported");
+ ARM_COMPUTE_UNUSED(kernel_dims);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+ }
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, output_convolved_dims, DataLayout::NCHW));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ bool window_changed = false;
+
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), 2), ceil_to_multiple(output->dimension(1), 2));
+
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, bias_access, output_access);
+ }
+ else
+ {
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ }
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel()
+ : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims,
+ const Size2D &num_tiles)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(kernel_dims);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), output_convolved_dims, DataLayout::NCHW)));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), kernel_dims, output_convolved_dims, num_tiles));
+
+ _input = input;
+ _bias = bias;
+ _output = output;
+
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS"));
+ build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("winograd_output_transform_2x2_3x3_nchw", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "winograd_output_transform_2x2_3x3";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims,
+ const Size2D &num_tiles)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, kernel_dims, output_convolved_dims, num_tiles));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get()).first);
+
+ return Status{};
+}
+
+void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Get initial windows
+ Window slice = window.first_slice_window_3D();
+ slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ // Setup output slice
+ Window slice_out(slice);
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ if(_bias != nullptr)
+ {
+ unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
+ add_1D_tensor_argument(idx1, _bias, slice_biases);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+} \ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index a06d94c1f5..172facfa78 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -66,17 +66,21 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLT
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
if(c != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B");
+ }
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
new file mode 100644
index 0000000000..5081cbac4e
--- /dev/null
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _is_first_run(true)
+{
+}
+
+void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+ // TODO(COMPMID-1013): This part will be removed
+ // Get indeces for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Kernel size
+ const unsigned int kernel_w = weights->info()->tensor_shape()[idx_width];
+ const unsigned int kernel_h = weights->info()->tensor_shape()[idx_height];
+
+ // Number of tiles along the X and Y direction
+ const unsigned int num_tiles_x = std::ceil((input->info()->tensor_shape().x() - (kernel_w - 1) + conv_info.pad_left() + conv_info.pad_right()) / 2.f);
+ const unsigned int num_tiles_y = std::ceil((input->info()->tensor_shape().y() - (kernel_h - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / 2.f);
+
+ // Compute output shape
+ const TensorShape output_convolved_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
+
+ // Manage intermediate tensors
+ _memory_group.manage(&_input0);
+ _memory_group.manage(&_batched_mm_output);
+
+ // Do not manage _input1 as it contains the weights
+
+ // Configure input transform
+ _input_transform.configure(input, &_input0, conv_info, Size2D(kernel_w, kernel_h));
+
+ // Configure filter transform
+ _filter_transform.configure(weights, &_input1);
+
+ // Configure batched matrix multiply
+ _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+ // Configure output transform
+ _output_transform.configure(&_batched_mm_output, biases, output, Size2D(kernel_w, kernel_h), Size2D(output_convolved_shape[idx_width], output_convolved_shape[idx_height]), Size2D(num_tiles_x,
+ num_tiles_y));
+
+ // Allocate temporary tensors
+ _input0.allocator()->allocate();
+ _input1.allocator()->allocate();
+ _batched_mm_output.allocator()->allocate();
+}
+
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+ // TODO(COMPMID-1013): This part will be removed
+ // Get indeces for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Kernel size
+ const unsigned int kernel_w = weights->tensor_shape()[idx_width];
+ const unsigned int kernel_h = weights->tensor_shape()[idx_height];
+
+ // Number of tiles along the X and Y direction
+ const unsigned int num_tiles_x = std::ceil((input->tensor_shape().x() - (kernel_w - 1) + conv_info.pad_left() + conv_info.pad_right()) / 2.f);
+ const unsigned int num_tiles_y = std::ceil((input->tensor_shape().y() - (kernel_h - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / 2.f);
+
+ // Compute output shape
+ const TensorShape output_convolved_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
+
+ // Validate input transform
+ const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, Size2D(kernel_w, kernel_h));
+ const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, conv_info, Size2D(kernel_w, kernel_h)));
+
+ // Validate filter transform
+ const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1));
+
+ // Configure batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+
+ // Configure output transform
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, Size2D(kernel_w, kernel_h), Size2D(output_convolved_shape[idx_width],
+ output_convolved_shape[idx_height]),
+ Size2D(num_tiles_x, num_tiles_y)));
+
+ return Status{};
+}
+
+void CLWinogradConvolutionLayer::run()
+{
+ if(_is_first_run)
+ {
+ // Run filter transform
+ CLScheduler::get().enqueue(_filter_transform, false);
+
+ _is_first_run = false;
+ }
+
+ _memory_group.acquire();
+
+ // Run input transform
+ _input_transform.run();
+
+ // Run batched matrix multiplication
+ _batched_mm.run();
+
+ // Run output transform
+ CLScheduler::get().enqueue(_output_transform);
+
+ _memory_group.release();
+}
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index 652f31ac74..0499d4cd2f 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -40,6 +40,6 @@ void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, co
Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &conv_info, const Size2D &kernel_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON(CLWinogradInputTransformKernel::validate(input, output, conv_info, kernel_dims));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, conv_info, kernel_dims));
return Status{};
}
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index 086b2e3def..ec8e09fa81 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,6 +37,28 @@ namespace test
{
namespace datasets
{
+class LargeWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset
+{
+public:
+ LargeWinogradConvolutionLayer3x3Dataset()
+ {
+ // Kernel size 3
+ // Batch size 1
+ add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 1, 0));
+ add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 1));
+ add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 1, 0));
+ add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 512U), TensorShape(512U), TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 1));
+ add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(12U, 12U, 512U), PadStrideInfo(1, 1, 0, 0));
+ // Batch size 3, 2 and 4
+ add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 1));
+ add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1));
+ add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 1));
+ }
+};
+
class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset
{
public:
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index adb61de8e2..696c396eef 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -37,10 +37,10 @@ namespace test
{
namespace datasets
{
-class SmallWinogradLayerDataset final : public ConvolutionLayerDataset
+class SmallWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset
{
public:
- SmallWinogradLayerDataset()
+ SmallWinogradConvolutionLayer3x3Dataset()
{
// Kernel size 3
// Batch size 1
@@ -48,8 +48,14 @@ public:
// Batch size 4
add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(21U, 25U, 21U, 4U), PadStrideInfo(1, 1, 0, 0));
add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 1, 1));
+ }
+};
- // Kernel size 5
+class SmallWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset
+{
+public:
+ SmallWinogradConvolutionLayer5x5Dataset()
+ {
add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U, 1U), TensorShape(1U), TensorShape(4U, 4U, 1U), PadStrideInfo(1, 1, 0, 0));
add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 2, 2));
}
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
new file mode 100644
index 0000000000..c42d6c8ebd
--- /dev/null
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+#define ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET
+
+#include "utils/TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+class WinogradOutputTransformDataset
+{
+public:
+ using type = std::tuple<TensorShape, Size2D, Size2D, Size2D, DataLayout>;
+
+ struct iterator
+ {
+ iterator(std::vector<TensorShape>::const_iterator a_it,
+ std::vector<Size2D>::const_iterator b_it,
+ std::vector<Size2D>::const_iterator c_it,
+ std::vector<Size2D>::const_iterator d_it,
+ std::vector<DataLayout>::const_iterator data_layout_it)
+ : _a_it{ std::move(a_it) },
+ _b_it{ std::move(b_it) },
+ _c_it{ std::move(c_it) },
+ _d_it{ std::move(d_it) },
+ _data_layout_it{ std::move(data_layout_it) }
+ {
+ }
+
+ std::string description() const
+ {
+ std::stringstream description;
+ description << "Input=" << *_a_it << ":";
+ description << "KernelDims=" << *_b_it << ":";
+ description << "OutputDims=" << *_c_it << ":";
+ description << "NumTiles=" << *_d_it << ":";
+ description << "DataLayout=" << *_data_layout_it;
+ return description.str();
+ }
+
+ WinogradOutputTransformDataset::type operator*() const
+ {
+ return std::make_tuple(*_a_it, *_b_it, *_c_it, *_d_it, *_data_layout_it);
+ }
+
+ iterator &operator++()
+ {
+ ++_a_it;
+ ++_b_it;
+ ++_c_it;
+ ++_d_it;
+ ++_data_layout_it;
+
+ return *this;
+ }
+
+ private:
+ std::vector<TensorShape>::const_iterator _a_it;
+ std::vector<Size2D>::const_iterator _b_it;
+ std::vector<Size2D>::const_iterator _c_it;
+ std::vector<Size2D>::const_iterator _d_it;
+ std::vector<DataLayout>::const_iterator _data_layout_it;
+ };
+
+ iterator begin() const
+ {
+ return iterator(_a_shapes.begin(), _b_dims.begin(), _c_dims.begin(), _d_dims.begin(), _data_layout.begin());
+ }
+
+ int size() const
+ {
+ return std::min(_a_shapes.size(), std::min(_b_dims.size(), std::min(_c_dims.size(), std::min(_d_dims.size(), _data_layout.size()))));
+ }
+
+ void add_config(TensorShape a, Size2D b, Size2D c, Size2D d, DataLayout data_layout)
+ {
+ _a_shapes.emplace_back(std::move(a));
+ _b_dims.emplace_back(std::move(b));
+ _c_dims.emplace_back(std::move(c));
+ _d_dims.emplace_back(std::move(d));
+ _data_layout.emplace_back(std::move(data_layout));
+ }
+
+protected:
+ WinogradOutputTransformDataset() = default;
+ WinogradOutputTransformDataset(WinogradOutputTransformDataset &&) = default;
+
+private:
+ std::vector<TensorShape> _a_shapes{};
+ std::vector<Size2D> _b_dims{};
+ std::vector<Size2D> _c_dims{};
+ std::vector<Size2D> _d_dims{};
+ std::vector<DataLayout> _data_layout{};
+};
+
+class SmallWinogradOutputTransformDataset final : public WinogradOutputTransformDataset
+{
+public:
+ SmallWinogradOutputTransformDataset()
+ {
+ add_config(TensorShape(24U, 49U, 16U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW);
+ add_config(TensorShape(13U, 6U, 16U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW);
+ add_config(TensorShape(7U, 20U, 16U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW);
+ add_config(TensorShape(24U, 49U, 16U, 3U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW);
+ add_config(TensorShape(13U, 6U, 16U, 2U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW);
+ add_config(TensorShape(7U, 20U, 16U, 5U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW);
+ }
+};
+
+class LargeWinogradOutputTransformDataset final : public WinogradOutputTransformDataset
+{
+public:
+ LargeWinogradOutputTransformDataset()
+ {
+ add_config(TensorShape(128U, 3136U, 16U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW);
+ add_config(TensorShape(256U, 784U, 16U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW);
+ add_config(TensorShape(512U, 169U, 16U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW);
+ add_config(TensorShape(128U, 3136U, 16U, 3U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW);
+ add_config(TensorShape(256U, 784U, 16U, 2U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW);
+ add_config(TensorShape(512U, 169U, 16U, 5U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW);
+ }
+};
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET */
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 0b21ed2577..aa668fa575 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -22,17 +22,22 @@
* SOFTWARE.
*/
#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
+#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
#include "tests/CL/CLAccessor.h"
#include "tests/CL/Helper.h"
#include "tests/PaddingCalculator.h"
+#include "tests/datasets/LargeConvolutionLayerDataset.h"
#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/SmallConvolutionLayerDataset.h"
#include "tests/datasets/WinogradFilterTransformDataset.h"
#include "tests/datasets/WinogradInputTransformDataset.h"
+#include "tests/datasets/WinogradOutputTransformDataset.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
@@ -47,7 +52,7 @@ namespace validation
{
namespace
{
-constexpr AbsoluteTolerance<float> tolerance_f32(0.0001f);
+constexpr AbsoluteTolerance<float> tolerance_f32(0.001f);
} // namespace
using namespace arm_compute::misc::shape_calculator;
@@ -65,9 +70,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported
TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32), // Kernel size not supported
TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32), // Strides not supported
- TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32), // valid
- TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32), // valid
- TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32) // valid
+ TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32) // Padding needed
}),
framework::dataset::make("OutputInfo", {
TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F16),
@@ -96,7 +101,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
Size2D(3U, 3U),
Size2D(3U, 3U)
})),
- framework::dataset::make("Expected", { false, false, false, false, true, true, true })),
+ framework::dataset::make("Expected", { false, false, false, false, false, false, false })),
input_info, output_info, conv_info, kernel_dims, expected)
{
ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, kernel_dims)) == expected, framework::LogLevel::ERRORS);
@@ -203,8 +208,172 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::Da
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
+
TEST_SUITE_END() // FilterTransform
+TEST_SUITE(OutputTransform)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+ framework::dataset::make("InputInfo",{
+ TensorInfo(TensorShape(24U, 49U, 16U, 5U), 1, DataType::F16), // F16 not supported
+ TensorInfo(TensorShape(128U, 3136U, 16U, 5U), 1, DataType::QASYMM8), // QASYMM8 not supported
+ TensorInfo(TensorShape(256U, 784U, 16U, 5U), 1, DataType::F32), // Kernel size not supported
+ TensorInfo(TensorShape(512U, 169U, 16U, 5U), 1, DataType::F32), // Valid
+ TensorInfo(TensorShape(13U, 6U, 16U, 4U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(7U, 16U, 16U, 7U), 1, DataType::F32), // Valid
+ TensorInfo(TensorShape(1U, 442U, 16U, 37U), 1, DataType::F32) // Wrong number of tiles
+ }),
+ framework::dataset::make("BiasInfo", {
+ TensorInfo(TensorShape(24U), 1, DataType::F16),
+ TensorInfo(TensorShape(128U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(256U), 1, DataType::F32),
+ TensorInfo(TensorShape(512U), 1, DataType::F32),
+ TensorInfo(TensorShape(13U), 1, DataType::F32),
+ TensorInfo(TensorShape(7U), 1, DataType::F32),
+ TensorInfo(TensorShape(1U), 1, DataType::F32)
+ })),
+ framework::dataset::make("OutputInfo", {
+ TensorInfo(TensorShape(14U, 14U, 24U, 5U), 1, DataType::F16),
+ TensorInfo(TensorShape(112U, 112U, 128U, 5U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(55U, 55U, 256U, 5U), 1, DataType::F32),
+ TensorInfo(TensorShape(26U, 26U, 512U, 5U), 1, DataType::F32),
+ TensorInfo(TensorShape(5U, 4U, 13U, 4U), 1, DataType::F32),
+ TensorInfo(TensorShape(8U, 8U, 7U, 7U), 1, DataType::F32),
+ TensorInfo(TensorShape(51U, 33U, 1U, 37U), 1, DataType::F32)
+ })),
+ framework::dataset::make("KernelDims", {
+ Size2D(3U, 3U),
+ Size2D(3U, 3U),
+ Size2D(5U, 5U),
+ Size2D(3U, 3U),
+ Size2D(3U, 3U),
+ Size2D(3U, 3U),
+ Size2D(3U, 3U)
+ })),
+ framework::dataset::make("OutputDims", {
+ Size2D(14U, 14U),
+ Size2D(112U, 112U),
+ Size2D(55U, 55U),
+ Size2D(26U, 26U),
+ Size2D(5U, 4U),
+ Size2D(8U, 8U),
+ Size2D(51U, 33U)
+ })),
+ framework::dataset::make("NumTiles", {
+ Size2D(7U, 7U),
+ Size2D(56U, 56U),
+ Size2D(28U, 28U),
+ Size2D(13U, 13U),
+ Size2D(3U, 2U),
+ Size2D(4U, 4U),
+ Size2D(26U, 16U)
+ })),
+ framework::dataset::make("Expected", { false, false, false, true, false, true, false })),
+ input_info, bias_info, output_info, kernel_dims, output_dims, num_tiles, expected)
+{
+ ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), kernel_dims, output_dims, num_tiles)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradOutputTransform = CLSynthetizeFunctionWithZeroConstantBorder<CLWinogradOutputTransformKernel, 0>;
+using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture<CLTensor, CLAccessor, CLWinogradOutputTransform, float>;
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallWinogradOutputTransformDataset(), datasets::LargeWinogradOutputTransformDataset()),
+ framework::dataset::make("DataType", { DataType::F32 })),
+ shape_a, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type)
+{
+ TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), output_convolved_dims, data_layout);
+
+ // Create tensors
+ CLTensor a = create_tensor<CLTensor>(shape_a, data_type);
+ CLTensor b = create_tensor<CLTensor>(shape_b, data_type);
+
+ ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Create and configure function
+ CLWinogradOutputTransform winograd_output_transform;
+ winograd_output_transform.configure(&a, nullptr, &b, kernel_dims, output_convolved_dims, num_tiles);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, combine(datasets::SmallWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+TEST_SUITE_END() // OutputTransform
+
+TEST_SUITE(ConvolutionLayer)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", {
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16), // FP16 not supported
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Datatype mismatch
+ TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported
+ TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32), // Padding needed
+ TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32) // Kernel size not supported
+ }),
+ framework::dataset::make("WeightsInfo", {
+ TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F32),
+ TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32),
+ TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32),
+ TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16)
+ })),
+ framework::dataset::make("BiasesInfo", {
+ TensorInfo(TensorShape(19U), 1, DataType::F32),
+ TensorInfo(TensorShape(19U), 1, DataType::F32),
+ TensorInfo(TensorShape(21U), 1, DataType::F32),
+ TensorInfo(TensorShape(16U), 1, DataType::F32),
+ TensorInfo(TensorShape(16U), 1, DataType::F32)
+ })),
+ framework::dataset::make("OutputInfo", {
+ TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32),
+ TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32),
+ TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32),
+ TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32),
+ TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32)
+ })),
+ framework::dataset::make("ConvInfo", {
+ PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 2, 0, 0),
+ PadStrideInfo(1, 1, 1, 1),
+ PadStrideInfo(1, 1, 1, 0)
+ })),
+ framework::dataset::make("Expected", { false, false, false, false, false })),
+ input_info, weights_info, bias_info, output_info, conv_info, expected)
+{
+ ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CLWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<CLTensor, CLAccessor, CLWinogradConvolutionLayer, float>;
+FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+ framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), framework::dataset::make("DataType", { DataType::F32 })))
+{
+ // Validate output
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // ConvolutionLayer
+
TEST_SUITE_END() // Winograd
TEST_SUITE_END() // CL
} // namespace validation
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 59db279ac7..34306b381c 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -109,10 +109,12 @@ TEST_SUITE_END()
TEST_SUITE(WinogradLayer)
template <typename T>
-using NEWinogradLayerFixture = WinogradLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
+using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture<Tensor, Accessor, NEWinogradLayer, T>;
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradLayerFixture<float>, framework::DatasetMode::PRECOMMIT, datasets::SmallWinogradLayerDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(),
+ datasets::SmallWinogradConvolutionLayer5x5Dataset()),
+ framework::dataset::make("DataType", { DataType::F32 })))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/WinogradLayerFixture.h
index bfe1efce3b..9811c28008 100644
--- a/tests/validation/fixtures/WinogradLayerFixture.h
+++ b/tests/validation/fixtures/WinogradLayerFixture.h
@@ -48,14 +48,14 @@ namespace validation
using namespace arm_compute::misc::shape_calculator;
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
-class WinogradLayerValidationFixture : public framework::Fixture
+class WinogradConvolutionLayerValidationFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info)
+ void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, DataType data_type)
{
- _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info);
- _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info);
+ _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
+ _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type);
}
protected:
@@ -79,13 +79,14 @@ protected:
}
}
- TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+ TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+ DataType data_type)
{
// Create tensors
- TensorType src = create_tensor<TensorType>(input_shape, DataType::F32, 1);
- TensorType weights = create_tensor<TensorType>(weights_shape, DataType::F32, 1);
- TensorType bias = create_tensor<TensorType>(bias_shape, DataType::F32, 1);
- TensorType dst = create_tensor<TensorType>(output_shape, DataType::F32, 1);
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1);
+ TensorType weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+ TensorType bias = create_tensor<TensorType>(bias_shape, data_type, 1);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1);
// Create and configure function
FunctionType conv;
@@ -111,20 +112,20 @@ protected:
fill(AccessorType(src), 0, -1.f, 1.f);
fill(AccessorType(weights), 1, -1.f, 1.f);
fill(AccessorType(bias), 2, -1.f, 1.f);
- fill(AccessorType(dst), 3, -1.f, 1.f);
- // Compute NEWinogradLayer function
+ // Compute Winograd Convolution function
conv.run();
return dst;
}
- SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info)
+ SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info,
+ DataType data_type)
{
// Create reference
- SimpleTensor<T> src{ input_shape, DataType::F32, 1 };
- SimpleTensor<T> weights{ weights_shape, DataType::F32, 1 };
- SimpleTensor<T> bias{ bias_shape, DataType::F32, 1 };
+ SimpleTensor<T> src{ input_shape, data_type, 1 };
+ SimpleTensor<T> weights{ weights_shape, data_type, 1 };
+ SimpleTensor<T> bias{ bias_shape, data_type, 1 };
// Fill reference
fill(src, 0, -1.f, 1.f);
@@ -136,8 +137,6 @@ protected:
TensorType _target{};
SimpleTensor<T> _reference{};
- int _fractional_bits{};
- DataType _data_type{};
};
template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -178,7 +177,6 @@ protected:
{
ARM_COMPUTE_UNUSED(is_nchw_format);
- // Create tensors
TensorType src = create_tensor<TensorType>(input_shape, data_type);
TensorType dst = create_tensor<TensorType>(output_shape, data_type);
@@ -261,8 +259,8 @@ protected:
ARM_COMPUTE_UNUSED(is_nchw_format);
// Create tensors
- TensorType src = create_tensor<TensorType>(input_shape, data_type);
- TensorType dst = create_tensor<TensorType>(output_shape, data_type);
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1);
// Create and configure function
FunctionType filter_transform;
@@ -288,7 +286,7 @@ protected:
SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, bool is_nchw_format, DataType data_type)
{
- ARM_COMPUTE_ERROR_ON(!is_nchw_format);
+ ARM_COMPUTE_UNUSED(is_nchw_format);
// Create reference
SimpleTensor<T> src{ input_shape, data_type, 1 };
@@ -302,6 +300,86 @@ protected:
TensorType _target{};
SimpleTensor<T> _reference{};
};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class WinogradOutputTransformValidationFixture : public framework::Fixture
+{
+public:
+ template <typename...>
+ void setup(TensorShape input_shape, Size2D kernel_dims, Size2D output_convolved_dims, Size2D num_tiles, DataLayout data_layout, DataType data_type)
+ {
+ TensorShape output_shape = compute_winograd_output_transform_shape(TensorInfo(input_shape, 1, data_type), output_convolved_dims, data_layout);
+
+ _target = compute_target(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+ _reference = compute_reference(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type);
+ }
+
+protected:
+ template <typename U>
+ void fill(U &&tensor, int i, float min, float max)
+ {
+ switch(tensor.data_type())
+ {
+ case DataType::F32:
+ {
+ std::uniform_real_distribution<> distribution(min, max);
+ library->fill(tensor, distribution, i);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported");
+ library->fill_tensor_uniform(tensor, i);
+ break;
+ }
+ }
+ }
+
+ TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles, DataLayout data_layout,
+ DataType data_type)
+ {
+ // Create tensors
+ TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+ TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout);
+
+ // Create and configure function
+ FunctionType output_transform;
+ output_transform.configure(&src, nullptr, &dst, kernel_dims, output_convolved_dims, num_tiles);
+
+ ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Fill tensors
+ fill(AccessorType(src), 0, -1.f, 1.f);
+
+ output_transform.run();
+
+ return dst;
+ }
+
+ SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles,
+ DataLayout data_layout,
+ DataType data_type)
+ {
+ // Create reference
+ SimpleTensor<T> src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout };
+
+ // Fill reference
+ fill(src, 0, -1.f, 1.f);
+
+ return reference::winograd_output_transform<T>(src, output_shape, kernel_dims, num_tiles);
+ }
+
+ TensorType _target{};
+ SimpleTensor<T> _reference{};
+};
} // namespace validation
} // namespace test
} // namespace arm_compute
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 24bbf32a30..f3db274935 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -118,4 +118,4 @@ template SimpleTensor<uint8_t> convolution_layer(const SimpleTensor<uint8_t> &sr
} // namespace reference
} // namespace validation
} // namespace test
-} // namespace arm_compute
+} // namespace arm_compute \ No newline at end of file
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 3ed55fb9fc..c760663b22 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -39,6 +39,87 @@ namespace reference
namespace
{
template <typename T>
+void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+{
+ // Simple tensor for the 3x3 input tile
+ SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+
+ // Simple tensor for the transformation matrix
+ SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+ // Simple tensor for the transformation matrix transpose
+ SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+
+ // Simple tensor for the 4x3 temporary tile
+ SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+
+ // Simple tensor for the 4x4 output tile
+ SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+
+ // Initialize transformation matrix
+ // 1 | 0 | 0
+ // 0.5 | 0.5 | 0.5
+ // 0.5 |-0.5 | 0.5
+ // 0 | 0 | 1
+ trans_matrix[0 + 0 * 3] = 1.0f;
+ trans_matrix[1 + 0 * 3] = 0.0f;
+ trans_matrix[2 + 0 * 3] = 0.0f;
+ trans_matrix[0 + 1 * 3] = 0.5f;
+ trans_matrix[1 + 1 * 3] = 0.5f;
+ trans_matrix[2 + 1 * 3] = 0.5f;
+ trans_matrix[0 + 2 * 3] = 0.5f;
+ trans_matrix[1 + 2 * 3] = -0.5f;
+ trans_matrix[2 + 2 * 3] = 0.5f;
+ trans_matrix[0 + 3 * 3] = 0.0f;
+ trans_matrix[1 + 3 * 3] = 0.0f;
+ trans_matrix[2 + 3 * 3] = 1.0f;
+
+ // Transpose the transformation matrix
+ transpose_matrix(trans_matrix, trans_matrix_transposed);
+
+ const int num_channels = in.shape()[2];
+ const int num_filters = in.shape()[3];
+ const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters);
+
+ for(int n = 0; n < num_batches; ++n)
+ {
+ for(int w = 0; w < num_filters; ++w)
+ {
+ for(int z = 0; z < num_channels; ++z)
+ {
+ // Load the 3x3 tile from the input tensor
+ get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+
+ // First transformation
+ matrix_multiply(trans_matrix, input_tile, tmp_tile);
+
+ // Second transformation
+ matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
+
+ // Store the 4x4 output tile across the 16 channels
+ const int output_offset = w + z * num_filters;
+ out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4];
+ out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4];
+ out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4];
+ out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4];
+ out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4];
+ out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4];
+ out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4];
+ out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4];
+ out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4];
+ out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4];
+ out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
+ out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
+ out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
+ out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
+ out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
+ out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+ }
+ }
+ }
+}
+
+template <typename T>
void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const PadStrideInfo &conv_info)
{
TensorShape shape4x4(4u, 4u);
@@ -112,56 +193,70 @@ void winograd_input_transform3x3(const SimpleTensor<T> &src, SimpleTensor<T> &ds
}
template <typename T>
-void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out)
+void winograd_output_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &out, int num_tiles_x)
{
+ ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16);
+ ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]);
+
// Simple tensor for the 3x3 input tile
- SimpleTensor<T> input_tile{ TensorShape(3u, 3u), in.data_type(), 1 };
+ SimpleTensor<T> input_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
// Simple tensor for the transformation matrix
- SimpleTensor<T> trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 };
+ SimpleTensor<T> trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 };
// Simple tensor for the transformation matrix transpose
- SimpleTensor<T> trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 };
+ SimpleTensor<T> trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 };
// Simple tensor for the 4x3 temporary tile
- SimpleTensor<T> tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 };
+ SimpleTensor<T> tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 };
// Simple tensor for the 4x4 output tile
- SimpleTensor<T> output_tile{ TensorShape(4u, 4u), in.data_type(), 1 };
+ SimpleTensor<T> output_tile{ TensorShape(2u, 2u), in.data_type(), 1 };
// Initialize transformation matrix
- // 1 | 0 | 0
- // 0.5 | 0.5 | 0.5
- // 0.5 |-0.5 | 0.5
- // 0 | 0 | 1
- trans_matrix[0 + 0 * 3] = 1.0f;
- trans_matrix[1 + 0 * 3] = 0.0f;
- trans_matrix[2 + 0 * 3] = 0.0f;
- trans_matrix[0 + 1 * 3] = 0.5f;
- trans_matrix[1 + 1 * 3] = 0.5f;
- trans_matrix[2 + 1 * 3] = 0.5f;
- trans_matrix[0 + 2 * 3] = 0.5f;
- trans_matrix[1 + 2 * 3] = -0.5f;
- trans_matrix[2 + 2 * 3] = 0.5f;
- trans_matrix[0 + 3 * 3] = 0.0f;
- trans_matrix[1 + 3 * 3] = 0.0f;
- trans_matrix[2 + 3 * 3] = 1.0f;
+ // 1 | 1 | 1 | 1
+ // 0 | 1 | -1 | -1
+ trans_matrix[0 + 0 * 4] = 1.0f;
+ trans_matrix[1 + 0 * 4] = 1.0f;
+ trans_matrix[2 + 0 * 4] = 1.0f;
+ trans_matrix[3 + 0 * 4] = 0.0f;
+ trans_matrix[0 + 1 * 4] = 0.0f;
+ trans_matrix[1 + 1 * 4] = 1.0f;
+ trans_matrix[2 + 1 * 4] = -1.0f;
+ trans_matrix[3 + 1 * 4] = -1.0f;
// Transpose the transformation matrix
transpose_matrix(trans_matrix, trans_matrix_transposed);
- const int num_channels = in.shape()[2];
- const int num_filters = in.shape()[3];
- const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters);
+ const int w_in = in.shape()[0];
+ const int h_in = in.shape()[1];
+ const int c_in = in.shape()[2];
+ const int w_out = out.shape()[0];
+ const int h_out = out.shape()[1];
+ const int c_out = out.shape()[2];
+ const int num_batches = in.shape().total_size() / (w_in * h_in * c_in);
+
+ // Input strides
+ const int stridey_in = w_in;
+ const int stridez_in = stridey_in * h_in;
+ const int stridew_in = stridez_in * c_in;
+
+ // Output strides
+ const int stridey_out = w_out;
+ const int stridez_out = stridey_out * h_out;
+ const int stridew_out = stridez_out * c_out;
for(int n = 0; n < num_batches; ++n)
{
- for(int w = 0; w < num_filters; ++w)
+ for(int y = 0; y < h_in; ++y)
{
- for(int z = 0; z < num_channels; ++z)
+ for(int x = 0; x < w_in; ++x)
{
- // Load the 3x3 tile from the input tensor
- get_tile(in, input_tile, Coordinates(0, 0, z, w, n));
+ // Load the 4x4 tile across the 16 channels of the input tensor
+ for(int z = 0; z < c_in; ++z)
+ {
+ input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)];
+ }
// First transformation
matrix_multiply(trans_matrix, input_tile, tmp_tile);
@@ -169,24 +264,29 @@ void winograd_filter_transform3x3(const SimpleTensor<T> &in, SimpleTensor<T> &ou
// Second transformation
matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile);
- // Store the 4x4 output tile across the 16 channels
- const int output_offset = w + z * num_filters;
- out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4];
- out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4];
- out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4];
- out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4];
- out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4];
- out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4];
- out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4];
- out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4];
- out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4];
- out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4];
- out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4];
- out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4];
- out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4];
- out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4];
- out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4];
- out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4];
+ // Store the 2x2 output tile
+ const int xo = (y % num_tiles_x) * 2;
+ const int yo = (y / num_tiles_x) * 2;
+ const int zo = x;
+
+ const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out);
+ out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2];
+
+ // Check out-of-bound writes
+ if(xo + 1 < w_out)
+ {
+ out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2];
+ }
+
+ if(yo + 1 < h_out)
+ {
+ out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2];
+ }
+
+ if((yo + 1 < h_out) && (xo + 1 < w_out))
+ {
+ out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2];
+ }
}
}
}
@@ -234,8 +334,32 @@ SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const Tenso
return out;
}
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format");
+ ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height);
+ ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area());
+
+ // Create reference
+ SimpleTensor<T> out{ output_shape, in.data_type(), 1 };
+
+ switch(kernel_dims.width)
+ {
+ case 3:
+ winograd_output_transform3x3(in, out, num_tiles.width);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Only supported 3x3 kernel");
+ break;
+ }
+
+ return out;
+}
+
template SimpleTensor<float> winograd_input_transform(const SimpleTensor<float> &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims);
template SimpleTensor<float> winograd_filter_transform(const SimpleTensor<float> &in, const TensorShape &output_shape);
+template SimpleTensor<float> winograd_output_transform(const SimpleTensor<float> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
} // namespace reference
} // namespace validation
} // namespace test
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
index ba8e5c1cb6..fa1a7f3f61 100644
--- a/tests/validation/reference/Winograd.h
+++ b/tests/validation/reference/Winograd.h
@@ -41,6 +41,9 @@ SimpleTensor<T> winograd_input_transform(const SimpleTensor<T> &src, const Tenso
template <typename T>
SimpleTensor<T> winograd_filter_transform(const SimpleTensor<T> &in, const TensorShape &output_shape);
+
+template <typename T>
+SimpleTensor<T> winograd_output_transform(const SimpleTensor<T> &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles);
} // namespace reference
} // namespace validation
} // namespace test