aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2018-04-11 15:59:10 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:49:37 +0000
commite52a3000d2c13bc1b66ca66b3d12b6b836982394 (patch)
tree70e8ef5ba216762604f84228805aac9bd65747b6
parentdd03870b63784abe499761da2b26b209b33f2db2 (diff)
downloadComputeLibrary-e52a3000d2c13bc1b66ca66b3d12b6b836982394.tar.gz
COMPMID-1026 - Add support for 4x4 output tile in CLWinogradConvolutionLayer
The performance achieved can be found at the following confluence page: https://confluence.arm.com/display/MLENG/GEMM-based+convolution+vs+Winograd-based+convolution+on+OpenCL Change-Id: I4b690cfdd4eb4ff0cd17b14fdd49ccaa1d1dc85c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127729 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h6
-rw-r--r--arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h6
-rw-r--r--arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h10
-rw-r--r--arm_compute/runtime/CL/functions/CLConvolutionLayer.h4
-rw-r--r--arm_compute/runtime/CL/functions/CLWinogradInputTransform.h6
-rw-r--r--src/core/CL/CLKernelLibrary.cpp2
-rw-r--r--src/core/CL/cl_kernels/winograd.cl436
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp19
-rw-r--r--src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp8
-rw-r--r--src/core/CL/kernels/CLWinogradInputTransformKernel.cpp19
-rw-r--r--src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp4
-rw-r--r--src/runtime/CL/functions/CLConvolutionLayer.cpp37
-rw-r--r--src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp18
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp8
-rw-r--r--tests/datasets/WinogradInputTransformDataset.h21
-rw-r--r--tests/datasets/WinogradOutputTransformDataset.h16
-rw-r--r--tests/validation/CL/ConvolutionLayer.cpp37
-rw-r--r--tests/validation/CL/DilatedConvolutionLayer.cpp37
-rw-r--r--tests/validation/CL/Winograd.cpp14
19 files changed, 601 insertions, 107 deletions
diff --git a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
index 7115710d59..828e2e521a 100644
--- a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
@@ -49,8 +49,7 @@ public:
/** Set the input and output tensor.
*
* @note Winograd filter transform supports the following configurations:
- * Output tile size: 2x2, 4x4
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
* @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). Data types supported: F32.
@@ -61,8 +60,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradFilterTransformKernel
*
* @note Winograd filter transform supports the following configurations:
- * Output tile size: 2x2, 4x4
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
* @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). Data types supported: F32.
diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
index 2d1eadf3cf..b92ff2f60c 100644
--- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
@@ -47,8 +47,7 @@ public:
/** Set the input and output of the kernel.
*
* @note Winograd input transform supports the following configurations:
- * Output tile size: 2x2
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
* @param[in] input The input tensor to transform. Data types supported: F32
@@ -59,8 +58,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransformKernel
*
* @note Winograd input transform supports the following configurations:
- * Output tile size: 2x2
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
* @param[in] input The input tensor to transform. Data types supported: F32
diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
index b0d0bbeeaa..5e64a82e48 100644
--- a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
@@ -49,11 +49,10 @@ public:
/** Set the input and output tensor.
*
* @note Winograd output transform supports the following configurations:
- * Output tile size: 2x2
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
- * @param[in] input Source tensor with shape [C, N, 16, batches]. Data types supported: F32.
+ * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F32.
* @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
* @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
* @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
@@ -62,11 +61,10 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
*
* @note Winograd output transform supports the following configurations:
- * Output tile size: 2x2
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
- * @param[in] input Source tensor with shape [C, N, 16, batches]. Data types supported: F32.
+ * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F32.
* @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
* @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
* @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 7df765988f..a1cd15515f 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -26,6 +26,7 @@
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -84,7 +85,6 @@ public:
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QS8/QASYMM8/QS16/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
* @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -95,7 +95,7 @@ public:
*
* @return a status
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:
void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
index 0e0d6bf284..64d3e80bc9 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
@@ -40,8 +40,7 @@ public:
/** Set the input and output tensors.
*
* @note Winograd input transform supports the following configurations:
- * Output tile size: 2x2
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
* @param[in] input The input tensor to transform. Data types supported: F32
@@ -52,8 +51,7 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransform.
*
* @note Winograd input transform supports the following configurations:
- * Output tile size: 2x2
- * Kernel size: 3x3
+ * F(output tile, kernel size):F(2x2, 3x3), F(4x4, 3x3), F(4x4, 5x5)
* Strides: only unit strides
*
* @param[in] input The input tensor to transform. Data types supported: F32
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 50f623fffb..59be956ad8 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -363,7 +363,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl" },
{ "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" },
{ "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" },
+ { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl" },
{ "winograd_output_transform_2x2_3x3_nchw", "winograd.cl" },
+ { "winograd_output_transform_4x4_3x3_nchw", "winograd.cl" },
{ "winograd_output_transform_4x4_5x5_nchw", "winograd.cl" },
{ "YUYV422_to_IYUV_bt709", "color_convert.cl" },
{ "YUYV422_to_NV12_bt709", "color_convert.cl" },
diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl
index cda23b0155..f40a969ea0 100644
--- a/src/core/CL/cl_kernels/winograd.cl
+++ b/src/core/CL/cl_kernels/winograd.cl
@@ -708,6 +708,265 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z));
}
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, the filter size 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ // Compute input address
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * 4 * src_stride_x + y * 4 * src_stride_y + z * src_stride_z;
+
+ src_addr = src_addr - ((int)PAD_LEFT * src_stride_x) - ((int)PAD_TOP * src_stride_y);
+
+ // Row4
+ float4 d40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
+ float2 d41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y));
+
+ float k0 = d41.s0;
+ float k1 = d41.s0;
+ float k2 = d41.s0;
+ float k3 = d41.s0;
+ float k4 = d41.s0;
+ float k5 = 0.0f;
+
+ k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
+ k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
+ k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
+ k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
+ k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
+ k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
+
+ // Row0
+ float4 d00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
+ float2 d01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y));
+
+ // Row2
+ float4 d20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
+ float2 d21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y));
+
+ // Compute destination address
+ __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + z * dst_stride_x + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+
+ uint dst_plane_stride = dst_stride_z / sizeof(float);
+
+ float out0 = k0;
+ float out1 = k1;
+ float out2 = k2;
+ float out3 = k3;
+ float out4 = k4;
+ float out5 = k5;
+ float out6 = k0;
+ float out7 = k1;
+ float out8 = k2;
+ float out9 = k3;
+ float out10 = k4;
+ float out11 = k5;
+ float out12 = k0;
+ float out13 = k1;
+ float out14 = k2;
+ float out15 = k3;
+ float out16 = k4;
+ float out17 = k5;
+ float out18 = k0;
+ float out19 = k1;
+ float out20 = k2;
+ float out21 = k3;
+ float out22 = k4;
+ float out23 = k5;
+ float out24 = k0;
+ float out25 = k1;
+ float out26 = k2;
+ float out27 = k3;
+ float out28 = k4;
+ float out29 = k5;
+
+ // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
+ out0 += 16.0f * d00.s0 - 20.0f * d00.s2 - 20.0f * d20.s0 + 25.0f * d20.s2 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 - 20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 - 10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 + 4.0f * d01.s0 - 5.0f * d21.s0;
+ out5 += 16.0f * d00.s1 - 20.0f * d00.s3 - 20.0f * d20.s1 + 4.0f * d01.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+
+ // Row1
+ float4 d10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
+ float2 d11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y));
+
+ // Row3
+ float4 d30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+ float2 d31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y));
+
+ // Compute common parts for the channels between [6, 29]
+ // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
+ // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
+ float part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+ float part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+ float part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
+ float part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
+ float part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+ float part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+ float part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
+ float part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+ float part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+ float part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
+ float part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+ float part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+
+ // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
+ // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
+ float part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+ float part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+ float part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+ float part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+ float part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+ float part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+ float part18 = part6 * 0.25f; // d20.s2 - d21.s0
+ float part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+ float part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+ float part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
+ float part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+ float part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+
+ out6 += part0 - part1;
+ out12 += part0 + part1;
+ out7 += part2 + part3 + part4 + part5;
+ out8 += part2 - part3 + part4 - part5;
+ out13 += part2 + part3 - part4 - part5;
+ out14 += part2 - part3 - part4 + part5;
+ out9 += part6 + part7 + part8 + part9;
+ out10 += part6 - part7 + part8 - part9;
+ out15 += part6 - part7 - part8 + part9;
+ out16 += part6 + part7 - part8 - part9;
+ out11 += part10 + part11;
+ out17 += part10 - part11;
+
+ out18 += part13 - part12;
+ out24 += part13 + part12;
+ out19 += part14 + part15 + part16 + part17;
+ out20 += part14 - part15 + part16 - part17;
+ out25 += part14 - part15 - part16 + part17;
+ out26 += part14 + part15 - part16 - part17;
+ out21 += part18 + part19 + part20 + part21;
+ out22 += part18 - part19 + part20 - part21;
+ out27 += part18 - part19 - part20 + part21;
+ out28 += part18 + part19 - part20 - part21;
+ out23 += part22 + part23;
+ out29 += part22 - part23;
+
+ *(dst_addr) = out6;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out7;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out8;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out9;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out10;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out11;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out12;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out13;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out14;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out15;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out16;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out17;
+ dst_addr += dst_plane_stride;
+
+ *(dst_addr) = out18;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out19;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out20;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out21;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out22;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out23;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out24;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out25;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out26;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out27;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out28;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out29;
+ dst_addr += dst_plane_stride;
+
+ // Row5
+ float4 d50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y));
+ float2 d51 = vload2(2, (__global float *)(src_addr + 5 * src_stride_y));
+
+ // Channels [30, 35]
+ out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
+ out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
+
+ *(dst_addr) = out0;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out1;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out2;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out3;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out4;
+ dst_addr += dst_plane_stride;
+ *(dst_addr) = out5;
+ dst_addr += dst_plane_stride;
+}
+
#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
({ \
comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
@@ -981,6 +1240,183 @@ __kernel void winograd_output_transform_2x2_3x3_nchw(
vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
}
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data format is NCHW
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_3x3_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+ ,
+ VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+ // Each thread stores a 4x4 tile
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+
+ // Load the values across the 36 channels to compose the 6x6 tile
+ float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
+ float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
+ float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
+ float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+ float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
+ float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+
+ float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
+ float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
+ float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
+ float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
+ float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
+ float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
+
+ float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
+ float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
+ float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
+ float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
+ float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
+ float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
+
+ float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
+ float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
+ float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
+ float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
+ float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
+ float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
+
+ float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
+ float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
+ float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
+ float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
+ float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
+ float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
+
+ float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
+ float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
+ float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
+ float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
+ float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
+ float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
+
+ // Compute out00, out01, out02 and out03
+ float out00 = d01 + d21 + d41 + d11 + d31;
+ float out01 = d01 + d21 + d41 + d11 + d31;
+ float out02 = d01 + d21 + d41 + d11 + d31;
+ float out03 = d01 + d21 + d41 + d11 + d31;
+
+ float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
+ float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
+
+ out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
+ out01 += k1 - d02 - d12 - d22 - d32 - d42;
+ out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
+ out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
+
+ // Compute out10, out11, out12 and out13
+ float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+ float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
+
+ k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
+
+ out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
+ out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
+ out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
+ out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
+
+ // Compute out20, out21, out22 and out23
+ float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+ float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
+
+ k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
+ k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
+
+ out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
+ out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
+ out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
+ out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
+
+ // Compute out30, out31, out32 and out33
+ float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+ float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;
+
+ k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;
+ k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;
+
+ out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;
+ out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;
+ out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;
+ out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;
+
+ int y_in = get_global_id(1);
+ int x_out = (y_in % NUM_TILES_X) * 4;
+ int y_out = (y_in / NUM_TILES_X) * 4;
+ int z_out = get_global_id(0);
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+ out02 += (float)b;
+ out03 += (float)b;
+
+ out10 += (float)b;
+ out11 += (float)b;
+ out12 += (float)b;
+ out13 += (float)b;
+
+ out20 += (float)b;
+ out21 += (float)b;
+ out22 += (float)b;
+ out23 += (float)b;
+
+ out30 += (float)b;
+ out31 += (float)b;
+ out32 += (float)b;
+ out33 += (float)b;
+
+#endif // defined(HAS_BIAS)
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z;
+
+ // Store the 4x4 output tile
+ vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+ vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+ vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
+ vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+}
+
#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
({ \
comm_fact.s0 = d1 + d2; \
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 0a0de7ad4e..805a594af6 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -286,17 +286,23 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
else // The input tensors have not been reshaped
{
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
// Create kernels according to the architecture, data type and input size.
if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && is_data_type_float(data_type))
{
- kernel_name = "gemm_mm_floating_point_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
- // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
- // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
- // FC6 and FC7 of AlexNet and VGG-16).
- if(input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1 && data_type == DataType::F32)
+ kernel_name = "gemm_mm_floating_point";
+
+ if(input0->info()->num_dimensions() != 1)
+ {
+ kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+ }
+ else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
{
- kernel_name += "_1000";
+ // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+ // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+ // FC6 and FC7 of AlexNet and VGG-16).
+ kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
}
// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
@@ -309,7 +315,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
}
else // (MIDGARD and F32) or (F16)
{
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
kernel_name = "gemm_mm_floating_point";
}
build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index d3a33c01f9..41b3ac50b5 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -55,9 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U));
- ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) && output_tile_size != Size2D(4U, 4U));
- ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+ && output_tile_size != Size2D(4U, 4U),
+ "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for 5x5 kernels");
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index a47590d20f..febd22b04e 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -47,7 +47,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
const Size2D kernel_size = winograd_info.kernel_size;
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U), "Winograd input transform only supports 2x2 output tile for 3x3 kernels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+ && output_tile_size != Size2D(4U, 4U),
+ "Winograd input transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for 5x5 kernels");
ARM_COMPUTE_UNUSED(conv_info);
ARM_COMPUTE_UNUSED(output_tile_size);
@@ -111,7 +113,6 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
const int num_elements_y = input->info()->dimension(1) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
// Check if we need to extend the right or bottom border
- // FIXME: This actually is not needed. Added just for validating the result;
const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
const unsigned int extra_border_bottom = ((num_elements_y % output_tile_size.height) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.height - 1);
@@ -137,19 +138,13 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
// Check optimized kernel if output_dims == 2x2
- if(output_tile_size.width == 2 && output_tile_size.height == 2)
+ if(output_tile_size == Size2D(2U, 2U))
{
- if((_input->info()->dimension(2) % 2) != 0)
- {
- _step_z = 1;
- }
- else
- {
- _step_z = 2;
- _lws_hint = cl::NDRange(1, 1, 8);
- }
+ _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
}
+ _lws_hint = cl::NDRange(1, 1, 8);
+
// Append stepz and data layout
kernel_name += "_stepz";
kernel_name += support::cpp11::to_string(_step_z);
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 8ee1a82209..c5d2528aa2 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -58,6 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 64, "Wrong number of batches");
// Compute number of elements to process in the X and Y direction
@@ -67,7 +68,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
const int num_tiles_y = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles_x * num_tiles_y)));
- ARM_COMPUTE_UNUSED(output_tile_size);
if(bias != nullptr)
{
@@ -207,4 +207,4 @@ void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue
enqueue(queue, *this, slice, _lws_hint);
}
while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
-} \ No newline at end of file
+}
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index bcb5424aab..643e24d638 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -48,9 +48,16 @@ void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, c
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info));
- switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+ switch(CLConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
weights_info, act_info, CLScheduler::get().target(), dilation))
{
+ case ConvolutionMethod::WINOGRAD:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLWinogradConvolutionLayer>();
+ f->configure(input, weights, biases, output, conv_info);
+ _function = std::move(f);
+ break;
+ }
case ConvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<CLDirectConvolutionLayer>();
@@ -79,8 +86,14 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
//Configure if the parameters match the direct convolution or the gemm-based
const GPUTarget gpu_target = CLScheduler::get().target();
- switch(CLConvolutionLayer::get_convolution_method(input, weights, biases, output, conv_info, weights_info, act_info, gpu_target, dilation))
+ switch(CLConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, act_info, gpu_target, dilation))
{
+ case ConvolutionMethod::WINOGRAD:
+ {
+ //Validate Winograd
+ CLWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info);
+ break;
+ }
case ConvolutionMethod::DIRECT:
{
// Validate direct convolution layer
@@ -101,19 +114,25 @@ Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
return Status{};
}
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation)
{
- ARM_COMPUTE_UNUSED(input);
- ARM_COMPUTE_UNUSED(weights);
- ARM_COMPUTE_UNUSED(biases);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_UNUSED(conv_info);
ARM_COMPUTE_UNUSED(weights_info);
ARM_COMPUTE_UNUSED(gpu_target);
- ARM_COMPUTE_UNUSED(dilation);
- ARM_COMPUTE_UNUSED(act_info);
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+ if((input->data_type() == DataType::F32) && (input->data_layout() == DataLayout::NCHW) && (input->dimension(idx_c) > 3) && (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3)
+ && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1) && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)) && (!act_info.enabled()))
+ {
+ return ConvolutionMethod::WINOGRAD;
+ }
return ConvolutionMethod::GEMM;
}
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 0aa7f8d1b5..86ccddac88 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -44,13 +44,18 @@ void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *we
const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape
- const TensorShape input_shape = input->info()->tensor_shape();
+ const TensorShape input_shape = input->info()->tensor_shape();
+ const unsigned int input_w = input->info()->tensor_shape()[idx_width];
+ const unsigned int input_h = input->info()->tensor_shape()[idx_height];
// Kernel size
const unsigned int kernel_w = weights->info()->tensor_shape()[idx_width];
const unsigned int kernel_h = weights->info()->tensor_shape()[idx_height];
- const WinogradInfo winograd_info = WinogradInfo(Size2D(2, 2),
+ //Winograd output tile
+ const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
Size2D(kernel_w, kernel_h),
Size2D(input_shape[idx_width], input_shape[idx_height]),
conv_info,
@@ -95,13 +100,18 @@ Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape
- const TensorShape input_shape = input->tensor_shape();
+ const TensorShape input_shape = input->tensor_shape();
+ const unsigned int input_w = input->tensor_shape()[idx_width];
+ const unsigned int input_h = input->tensor_shape()[idx_height];
// Kernel size
const unsigned int kernel_w = weights->tensor_shape()[idx_width];
const unsigned int kernel_h = weights->tensor_shape()[idx_height];
- const WinogradInfo winograd_info = WinogradInfo(Size2D(2, 2),
+ //Winograd output tile
+ const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w <= 4 && input_h <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
Size2D(kernel_w, kernel_h),
Size2D(input_shape[idx_width], input_shape[idx_height]),
conv_info,
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index afc354533d..b0603e92d2 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -109,10 +109,12 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_UNUSED(weights_info);
- ARM_COMPUTE_UNUSED(act_info);
- if((input->data_type() == DataType::F32) && (weights->dimension(0) == 3) && (weights->dimension(1) == 3) && (weights->num_dimensions() <= 4) && (conv_info.stride().first == 1)
- && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)))
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+ if((input->data_type() == DataType::F32) && (input->data_layout() == DataLayout::NCHW) && (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3) && (weights->num_dimensions() <= 4)
+ && (conv_info.stride().first == 1) && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)) && (!act_info.enabled()))
{
//FIXME Until COMPMID-1041 is implemented Winograd is slower than GEMM on A53.
if(Scheduler::get().cpu_info().get_cpu_model() != CPUModel::A53)
diff --git a/tests/datasets/WinogradInputTransformDataset.h b/tests/datasets/WinogradInputTransformDataset.h
index cbe63645de..59fb2adbd6 100644
--- a/tests/datasets/WinogradInputTransformDataset.h
+++ b/tests/datasets/WinogradInputTransformDataset.h
@@ -102,6 +102,7 @@ class SmallWinogradInputTransformDataset final : public WinogradInputTransformDa
public:
SmallWinogradInputTransformDataset()
{
+ // (2x2, 3x3)
add_config(TensorShape(9U, 9U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(27U, 13U, 2U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
add_config(TensorShape(128U, 64U, 1U, 3U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
@@ -109,6 +110,17 @@ public:
add_config(TensorShape(27U, 13U, 2U, 4U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(9U, 9U, 3U, 5U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
add_config(TensorShape(14U, 14U, 512U, 2U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+
+ // (4x4, 3x3)
+ add_config(TensorShape(9U, 9U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+ add_config(TensorShape(27U, 13U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+ add_config(TensorShape(128U, 64U, 1U, 3U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+ add_config(TensorShape(9U, 9U, 3U, 4U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+ add_config(TensorShape(27U, 13U, 2U, 4U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(27U, 13U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+ add_config(TensorShape(9U, 9U, 3U, 5U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+ add_config(TensorShape(14U, 14U, 512U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+
+ // (4x4, 5x5)
add_config(TensorShape(9U, 9U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(9U, 9U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(27U, 13U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(27U, 13U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
add_config(TensorShape(128U, 64U, 1U, 3U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(128U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
@@ -124,10 +136,19 @@ class LargeWinogradInputTransformDataset final : public WinogradInputTransformDa
public:
LargeWinogradInputTransformDataset()
{
+ // (2x2, 3x3)
add_config(TensorShape(42U, 37U, 8U, 15U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(42U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(57U, 60U, 13U, 8U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(57U, 60U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(128U, 64U, 21U, 13U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
add_config(TensorShape(83U, 72U, 14U, 5U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(83U, 72U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+
+ // (4x4, 3x3)
+ add_config(TensorShape(42U, 37U, 8U, 15U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(42U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+ add_config(TensorShape(57U, 60U, 13U, 8U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(57U, 60U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+ add_config(TensorShape(128U, 64U, 21U, 13U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(128U, 64U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+ add_config(TensorShape(83U, 72U, 14U, 5U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(83U, 72U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+
+ // (4x4, 5x5)
add_config(TensorShape(42U, 37U, 8U, 15U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(42U, 37U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(57U, 60U, 13U, 8U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(57U, 60U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(128U, 64U, 21U, 13U), WinogradInfo(Size2D(4U, 4U), Size2D(5U, 5U), Size2D(128U, 64U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
index aa36f153c5..b28df390d1 100644
--- a/tests/datasets/WinogradOutputTransformDataset.h
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -110,6 +110,13 @@ public:
add_config(TensorShape(1U, 442U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW));
add_config(TensorShape(7U, 12U, 16U, 3U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
add_config(TensorShape(24U, 49U, 16U, 2U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+
+ // (4x4, 3x3)
+ add_config(TensorShape(13U, 4U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(10U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+ add_config(TensorShape(13U, 6U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+ add_config(TensorShape(7U, 117U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW));
+ add_config(TensorShape(7U, 4U, 36U, 3U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
+ add_config(TensorShape(24U, 16U, 36U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(7U, 12U, 16U, 5U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW));
// (4x4, 5x5)
@@ -127,12 +134,21 @@ class LargeWinogradOutputTransformDataset final : public WinogradOutputTransform
public:
LargeWinogradOutputTransformDataset()
{
+ // (2x2, 3x3)
add_config(TensorShape(64U, 12544U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(224U, 224U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(32U, 3080U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW));
add_config(TensorShape(13U, 756U, 16U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW));
add_config(TensorShape(64U, 12544U, 16U, 3U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(224U, 224U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
add_config(TensorShape(32U, 3080U, 16U, 2U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW));
add_config(TensorShape(13U, 756U, 16U, 5U), WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW));
+
+ // (4x4, 3x3)
+ add_config(TensorShape(64U, 3136U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(224U, 224U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+ add_config(TensorShape(32U, 784U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW));
+ add_config(TensorShape(13U, 196U, 36U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW));
+ add_config(TensorShape(64U, 3136U, 36U, 3U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(224U, 224U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW));
+ add_config(TensorShape(32U, 784U, 36U, 2U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NCHW));
+ add_config(TensorShape(13U, 196U, 36U, 5U), WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NCHW));
}
};
} // namespace datasets
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index b50bb94bbb..52ad417cd0 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -72,25 +72,19 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
TEST_SUITE(CL)
TEST_SUITE(ConvolutionLayer)
-DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
- }),
- framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
- })),
- framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(16U), 1, DataType::F32, 0)
- })),
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
+ }),
+ framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
+ })),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
@@ -110,12 +104,11 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
GPUTarget::BIFROST
})),
- framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
- input_info, weights_info, biases_info, output_info, conv_info, gpu_target, expected)
+ framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
+ input_info, weights_info, output_info, conv_info, gpu_target, expected)
{
ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
&weights_info.clone()->set_is_resizable(false),
- &biases_info.clone()->set_is_resizable(false),
&output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target);
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp
index 532d04b431..e6a765bbe1 100644
--- a/tests/validation/CL/DilatedConvolutionLayer.cpp
+++ b/tests/validation/CL/DilatedConvolutionLayer.cpp
@@ -63,25 +63,19 @@ const auto CNNDataTypes = framework::dataset::make("DataType",
TEST_SUITE(CL)
TEST_SUITE(DilatedConvolutionLayer)
-DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
- }),
- framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
- })),
- framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(19U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(21U), 1, DataType::F32, 0),
- TensorInfo(TensorShape(16U), 1, DataType::F32, 0)
- })),
+DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+ framework::dataset::make("InputInfo", { TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 2U, 1U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32, 0)
+ }),
+ framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 2U, 19U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32, 0),
+ TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16, 0)
+ })),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32, 0),
TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32, 0),
@@ -107,12 +101,11 @@ DATA_TEST_CASE(ValidateConvolutionMethod, framework::DatasetMode::ALL, zip(zip(z
Size2D(3U, 3U)
})),
- framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
- input_info, weights_info, biases_info, output_info, conv_info, gpu_target, dilation, expected)
+ framework::dataset::make("Expected", { ConvolutionMethod::GEMM, ConvolutionMethod::GEMM, ConvolutionMethod::WINOGRAD, ConvolutionMethod::GEMM, ConvolutionMethod::GEMM })),
+ input_info, weights_info, output_info, conv_info, gpu_target, dilation, expected)
{
ConvolutionMethod is_valid = CLConvolutionLayer::get_convolution_method(&input_info.clone()->set_is_resizable(false),
&weights_info.clone()->set_is_resizable(false),
- &biases_info.clone()->set_is_resizable(false),
&output_info.clone()->set_is_resizable(false), conv_info, WeightsInfo(), ActivationLayerInfo(), gpu_target, dilation);
ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 4bfe4c8f42..6e673a5f96 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -236,7 +236,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(512U, 49U, 16U, 5U), 1, DataType::F32), // Valid
TensorInfo(TensorShape(13U, 108U, 16U, 4U), 1, DataType::F32), // Padding needed
TensorInfo(TensorShape(7U, 20U, 16U, 7U), 1, DataType::F32), // Valid
- TensorInfo(TensorShape(7U, 20U, 16U, 7U), 1, DataType::F32) // Wrong WinogradInfo
+ TensorInfo(TensorShape(7U, 20U, 16U, 7U), 1, DataType::F32), // Wrong WinogradInfo
+ TensorInfo(TensorShape(7U, 256U, 36U, 3U), 1, DataType::F32), // Valid
+ TensorInfo(TensorShape(7U, 256U, 16U, 3U), 1, DataType::F32) // Wrong number of batches
}),
framework::dataset::make("BiasInfo", {
TensorInfo(TensorShape(512U), 1, DataType::F16),
@@ -245,6 +247,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(512U), 1, DataType::F32),
TensorInfo(TensorShape(13U), 1, DataType::F32),
TensorInfo(TensorShape(7U), 1, DataType::F32),
+ TensorInfo(TensorShape(7U), 1, DataType::F32),
+ TensorInfo(TensorShape(7U), 1, DataType::F32),
TensorInfo(TensorShape(7U), 1, DataType::F32)
})),
framework::dataset::make("OutputInfo", {
@@ -254,7 +258,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
TensorInfo(TensorShape(14U, 14U, 512U, 5U), 1, DataType::F32),
TensorInfo(TensorShape(17U, 23U, 13U, 4U), 1, DataType::F32),
TensorInfo(TensorShape(8U, 10U, 7U, 7U), 1, DataType::F32),
- TensorInfo(TensorShape(7U, 9U, 7U, 7U), 1, DataType::F32)
+ TensorInfo(TensorShape(7U, 9U, 7U, 7U), 1, DataType::F32),
+ TensorInfo(TensorShape(64U, 64U, 7U, 3U), 1, DataType::F32),
+ TensorInfo(TensorShape(64U, 64U, 7U, 3U), 1, DataType::F32)
})),
framework::dataset::make("WinogradInfo", {
WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
@@ -264,8 +270,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(17U, 23U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
WinogradInfo(Size2D(2U, 2U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
WinogradInfo(Size2D(2U, 3U), Size2D(3U, 3U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NCHW),
+ WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW),
+ WinogradInfo(Size2D(4U, 4U), Size2D(3U, 3U), Size2D(64U, 64U), PadStrideInfo(1, 1, 1, 1), DataLayout::NCHW)
})),
- framework::dataset::make("Expected", { false, false, false, true, false, true, false })),
+ framework::dataset::make("Expected", { false, false, false, true, false, true, false, true, false })),
input_info, bias_info, output_info, winograd_info, expected)
{
ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), winograd_info)) == expected, framework::LogLevel::ERRORS);