aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgiuros01 <giuseppe.rossini@arm.com>2019-04-01 12:07:02 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2019-04-03 09:15:23 +0000
commit3bfacb24a9b6eced921027fd1c1e3cb3757db9c7 (patch)
tree98b1923502bd5623e283ad59261e95890427214a
parentdbfc2dc182f90af5cad6fc283fff817ac7258a19 (diff)
downloadComputeLibrary-3bfacb24a9b6eced921027fd1c1e3cb3757db9c7.tar.gz
COMPMID-1318: Implementing Winograd 7x7 NHWC on OpenCL - Part III
Change-Id: I7ebf09cc12fb117834faf88cdd556d2a66eacf07 Signed-off-by: giuros01 <giuseppe.rossini@arm.com> Reviewed-on: https://review.mlplatform.org/c/926 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h4
-rw-r--r--src/core/CL/CLKernelLibrary.cpp3
-rw-r--r--src/core/CL/cl_kernels/winograd_output_transform.cl374
-rw-r--r--src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp10
-rw-r--r--tests/datasets/WinogradOutputTransformDataset.h70
-rw-r--r--tests/validation/CL/Winograd.cpp16
-rw-r--r--tests/validation/reference/Winograd.cpp1
7 files changed, 458 insertions, 20 deletions
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index f11eb2a335..4ae75a5ace 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,7 +59,7 @@ public:
CLWinogradConvolutionLayer &operator=(CLWinogradConvolutionLayer &&) = default;
/** Set the input and output tensors.
*
- * @note: This function only works with 3x3,3x1,1x3,5x5,5x1 and 1x5 kernels along with unit strides for both NCHW and NHWC data layout
+ * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
* @note Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index cf6d4c9843..4fa8ac4142 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -527,6 +527,9 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
{ "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
{ "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
+ { "winograd_output_transform_2x2_7x7_nhwc", "winograd_output_transform.cl" },
+ { "winograd_output_transform_2x1_7x1_nhwc", "winograd_output_transform.cl" },
+ { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
{ "yolo_layer_nchw", "yolo_layer.cl" },
{ "yolo_layer_nhwc", "yolo_layer.cl" },
{ "YUYV422_to_IYUV_bt709", "color_convert.cl" },
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index e979978fa2..cffc12d6ed 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -180,6 +180,240 @@ __kernel void winograd_output_transform_2x2_3x3_nchw(
vstore2(ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2))), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
+
+#define COMPUTE_TMP_COL_2x2_7x7(col, d0, d1, d2, d3, d4, d5, d6, d7) \
+ ({ \
+ col.s0 = d0 + d1 + d2 + d3 + d4 + d5 + d6; \
+ col.s1 = -d1 + d2 - 2 * d3 + 2 * d4 + -3 * d5 + 3 * d6 + d7; \
+ })
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ // Each thread stores a 4x4/4x1 or 1x4 tile
+#if defined(SRC_DEPTH)
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+ int y_in = get_global_id(1);
+ int x_out = get_global_id(0);
+ int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
+ int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+ int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
+
+#if defined(SRC_DEPTH)
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else /* defined(SRC_DEPTH) */
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
+
+ // Load the values across the channels to compose the input tile
+ DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+ DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+ DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+ DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+ DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+ DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+ DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+ DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Compute out00, out01, out02 and out03
+ float out00 = d00 + d01 + d02 + d03 + d04 + d05 + d06;
+ float out01 = -d01 + d02 - 2.f * d03 + 2.0f * d04 - 3.0f * d05 + 3.0f * d06 + d07;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+ out00 += (float)b;
+ out01 += (float)b;
+#endif // defined(HAS_BIAS)
+
+ // Store the output tile
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Get output address
+#if defined(SRC_DEPTH)
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+ offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+ *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Get output address
+ int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ out0_dt = ACTIVATION_FUNC(CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)));
+ *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+ DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+ DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+ DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+ DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+ DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+ DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+ DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+ DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+ DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+ DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+ DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+ DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+ DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+ DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+ DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
+
+ DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+ DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+ DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+ DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+ DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+ DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+ DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+ DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+
+ DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+ DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+ DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+ DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+ DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+ DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+ DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+ DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
+
+ DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+ DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+ DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+ DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+ DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+ DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+ DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+ DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
+
+ DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+ DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+ DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+ DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+ DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+ DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+ DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+ DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
+
+ DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+ DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+ DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+ DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+ DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+ DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+ DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+ DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
+
+ // Compute the 8x2 intermediate tensor
+ VEC_DATA_TYPE(float, 2)
+ tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76);
+ COMPUTE_TMP_COL_2x2_7x7(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77);
+
+ // Compute the 2x2 output tile
+ VEC_DATA_TYPE(float, 2)
+ out_col0 = tmp_col0 + tmp_col1 + tmp_col2 + tmp_col3 + tmp_col4 + tmp_col5 + tmp_col6;
+ VEC_DATA_TYPE(float, 2)
+ out_col1 = -tmp_col1 + tmp_col2 - 2 * tmp_col3 + 2 * tmp_col4 - 3 * tmp_col5 + 3 * tmp_col6 + tmp_col7;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+
+ DATA_TYPE b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
+
+ out_col0 += (VEC_DATA_TYPE(float, 2))b;
+ out_col1 += (VEC_DATA_TYPE(float, 2))b;
+
+#endif // defined(HAS_BIAS)
+ // Get output address
+#if defined(SRC_DEPTH)
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+ int2 offset = (int2)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+ offset = min(offset + (int2)(0, 1) * (int2)dst_stride_z, (int2)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+ int2 mult_y = min((int2)dst_size - offset, (int2)1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+
+ // Store the output tile
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col0_dt = ACTIVATION_FUNC(CONVERT(out_col0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ out_col1_dt = ACTIVATION_FUNC(CONVERT(out_col1, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0_dt.s0;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1_dt.s0;
+
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0_dt.s1;
+ *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1_dt.s1;
+
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -227,8 +461,8 @@ __kernel void winograd_output_transform_4x4_3x3_nchw(
Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
#endif /* defined(SRC_DEPTH) */
// Load the values across the channels to compose the 6x6 or 6x1 tile
@@ -599,7 +833,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc(
#if defined(SRC_DEPTH)
int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
#else /* defined(SRC_DEPTH) */
- int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+ int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
#endif /* defined(SRC_DEPTH) */
offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
@@ -1231,6 +1465,72 @@ __kernel void winograd_output_transform_2x1_3x1_nchw(
#endif // defined(HAS_BIAS)
);
}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -1573,6 +1873,72 @@ __kernel void winograd_output_transform_1x2_1x3_nchw(
#endif // defined(HAS_BIAS)
);
}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 069196e8c1..57c66f8a22 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,6 +62,11 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims,
output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
kernel_dims.height == 1 ? 1U : 4U);
}
+ else if(kernel_max_dim == 7U)
+ {
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 7U,
+ kernel_dims.height == 1 ? 1U : 7U);
+ }
return output_tile;
}
@@ -73,7 +78,8 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
std::vector<WinogradConfiguration> fast_math_winograd =
{
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
};
auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
index 617a67de07..ea38e73adc 100644
--- a/tests/datasets/WinogradOutputTransformDataset.h
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -173,10 +173,10 @@ public:
}
};
-class SmallWinogradOutputTransformDatasetNHWC final : public WinogradOutputTransformDataset
+class SmallWinogradOutputTransformDatasetNHWC_F16 : public WinogradOutputTransformDataset
{
public:
- SmallWinogradOutputTransformDatasetNHWC()
+ SmallWinogradOutputTransformDatasetNHWC_F16()
{
// (4x1, 3x1)
add_config(TensorShape(13U, 12U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(7U, 6U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
@@ -225,7 +225,39 @@ public:
}
};
-class LargeWinogradOutputTransformDatasetNCHW final : public WinogradOutputTransformDataset
+class SmallWinogradOutputTransformDatasetNHWC_F32 : public SmallWinogradOutputTransformDatasetNHWC_F16
+{
+public:
+ SmallWinogradOutputTransformDatasetNHWC_F32()
+ : SmallWinogradOutputTransformDatasetNHWC_F16()
+ {
+ // (2x2, 7x7)
+ add_config(TensorShape(13U, 4U, 64U), WinogradInfo(Size2D(2U, 2U), Size2D(7U, 7U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(7U, 6U, 64U), WinogradInfo(Size2D(2U, 2U), Size2D(7U, 7U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(5U, 360U, 64U), WinogradInfo(Size2D(2U, 2U), Size2D(7U, 7U), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+ add_config(TensorShape(7U, 2U, 64U, 3U), WinogradInfo(Size2D(2U, 2U), Size2D(7U, 7U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(24U, 25U, 64U, 2U), WinogradInfo(Size2D(2U, 2U), Size2D(7U, 7U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 1), DataLayout::NHWC));
+ add_config(TensorShape(7U, 2U, 64U, 5U), WinogradInfo(Size2D(2U, 2U), Size2D(7U, 7U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+
+ // (2x1, 7x1)
+ add_config(TensorShape(13U, 18U, 8U), WinogradInfo(Size2D(2U, 1U), Size2D(7U, 1U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(7U, 22U, 8U), WinogradInfo(Size2D(2U, 1U), Size2D(7U, 1U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(5U, 858U, 8U), WinogradInfo(Size2D(2U, 1U), Size2D(7U, 1U), Size2D(53U, 33U), PadStrideInfo(1, 1, 2, 0), DataLayout::NHWC));
+ add_config(TensorShape(7U, 10U, 8U, 3U), WinogradInfo(Size2D(2U, 1U), Size2D(7U, 1U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(24U, 70U, 8U, 2U), WinogradInfo(Size2D(2U, 1U), Size2D(7U, 1U), Size2D(14U, 14U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+ add_config(TensorShape(7U, 30U, 8U, 5U), WinogradInfo(Size2D(2U, 1U), Size2D(7U, 1U), Size2D(8U, 10U), PadStrideInfo(1, 1, 2, 0), DataLayout::NHWC));
+
+ // (1x2, 1x7)
+ add_config(TensorShape(13U, 18U, 8U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 7U), Size2D(9U, 9U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(7U, 30U, 8U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 7U), Size2D(10U, 11U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(5U, 848U, 8U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 7U), Size2D(53U, 33U), PadStrideInfo(1, 1, 0, 2), DataLayout::NHWC));
+ add_config(TensorShape(7U, 16U, 8U, 3U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 7U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(24U, 70U, 8U, 2U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 7U), Size2D(14U, 14U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+ add_config(TensorShape(7U, 32U, 8U, 5U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 7U), Size2D(8U, 10U), PadStrideInfo(1, 1, 0, 2), DataLayout::NHWC));
+ }
+};
+
+class LargeWinogradOutputTransformDatasetNCHW : public WinogradOutputTransformDataset
{
public:
LargeWinogradOutputTransformDatasetNCHW()
@@ -298,10 +330,10 @@ public:
}
};
-class LargeWinogradOutputTransformDatasetNHWC final : public WinogradOutputTransformDataset
+class LargeWinogradOutputTransformDatasetNHWC_F16 : public WinogradOutputTransformDataset
{
public:
- LargeWinogradOutputTransformDatasetNHWC()
+ LargeWinogradOutputTransformDatasetNHWC_F16()
{
// (4x1, 3x1)
add_config(TensorShape(64U, 12488U, 6U), WinogradInfo(Size2D(4U, 1U), Size2D(3U, 1U), Size2D(224U, 223U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
@@ -346,6 +378,32 @@ public:
add_config(TensorShape(13U, 784U, 8U, 5U), WinogradInfo(Size2D(1U, 4U), Size2D(1U, 5U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
}
};
+
+class LargeWinogradOutputTransformDatasetNHWC_F32 : public LargeWinogradOutputTransformDatasetNHWC_F16
+{
+public:
+ LargeWinogradOutputTransformDatasetNHWC_F32()
+ {
+ // (2x2, 7x7)
+ add_config(TensorShape(32U, 756U, 64U), WinogradInfo(Size2D(2U, 2U), Size2D(5U, 5U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+ add_config(TensorShape(13U, 182U, 64U), WinogradInfo(Size2D(2U, 2U), Size2D(5U, 5U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+ add_config(TensorShape(32U, 756U, 64U, 2U), WinogradInfo(Size2D(2U, 2U), Size2D(5U, 5U), Size2D(112U, 112U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+ add_config(TensorShape(13U, 182U, 64U, 5U), WinogradInfo(Size2D(2U, 2U), Size2D(5U, 5U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+
+ // (2x1, 7x1)
+ add_config(TensorShape(32U, 3136U, 8U), WinogradInfo(Size2D(2U, 1U), Size2D(5U, 1U), Size2D(112U, 112U), PadStrideInfo(1, 1, 2, 0), DataLayout::NHWC));
+ add_config(TensorShape(13U, 784U, 8U), WinogradInfo(Size2D(2U, 1U), Size2D(5U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+ add_config(TensorShape(32U, 3024U, 8U, 2U), WinogradInfo(Size2D(2U, 1U), Size2D(5U, 1U), Size2D(112U, 112U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(13U, 784U, 8U, 5U), WinogradInfo(Size2D(2U, 1U), Size2D(5U, 1U), Size2D(56U, 56U), PadStrideInfo(1, 1, 1, 0), DataLayout::NHWC));
+
+ // (1x2, 1x7)
+ add_config(TensorShape(32U, 3136U, 8U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 5U), Size2D(112U, 112U), PadStrideInfo(1, 1, 0, 2), DataLayout::NHWC));
+ add_config(TensorShape(13U, 784U, 8U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 5U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+ add_config(TensorShape(32U, 3024U, 8U, 2U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 5U), Size2D(112U, 112U), PadStrideInfo(1, 1, 0, 0), DataLayout::NHWC));
+ add_config(TensorShape(13U, 784U, 8U, 5U), WinogradInfo(Size2D(1U, 2U), Size2D(1U, 5U), Size2D(56U, 56U), PadStrideInfo(1, 1, 0, 1), DataLayout::NHWC));
+ }
+};
+
} // namespace datasets
} // namespace test
} // namespace arm_compute
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 1042dd7e08..62f0335253 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -157,11 +157,15 @@ const auto LargeWinogradFilterTransformDatasetNHWC_F32 =
// Output transform
const auto SmallWinogradOutputTransformDatasetNCHW = datasets::SmallWinogradOutputTransformDatasetNCHW();
-const auto SmallWinogradOutputTransformDatasetNHWC = datasets::SmallWinogradOutputTransformDatasetNHWC();
+const auto SmallWinogradOutputTransformDatasetNHWC_F16 = datasets::SmallWinogradOutputTransformDatasetNHWC_F16();
+
+const auto SmallWinogradOutputTransformDatasetNHWC_F32 = datasets::SmallWinogradOutputTransformDatasetNHWC_F32();
const auto LargeWinogradOutputTransformDatasetNCHW = datasets::LargeWinogradOutputTransformDatasetNCHW();
-const auto LargeWinogradOutputTransformDatasetNHWC = datasets::LargeWinogradOutputTransformDatasetNHWC();
+const auto LargeWinogradOutputTransformDatasetNHWC_F16 = datasets::LargeWinogradOutputTransformDatasetNHWC_F16();
+
+const auto LargeWinogradOutputTransformDatasetNHWC_F32 = datasets::LargeWinogradOutputTransformDatasetNHWC_F32();
//Activation Functions
const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
@@ -514,7 +518,7 @@ TEST_SUITE_END() // NCHW
TEST_SUITE(NHWC)
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::ALL,
- combine(combine(SmallWinogradOutputTransformDatasetNHWC,
+ combine(combine(SmallWinogradOutputTransformDatasetNHWC_F16,
framework::dataset::make("DataType", { DataType::F16 })),
framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
{
@@ -523,7 +527,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP16, framework
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP16, framework::DatasetMode::NIGHTLY,
- combine(combine(LargeWinogradOutputTransformDatasetNHWC,
+ combine(combine(LargeWinogradOutputTransformDatasetNHWC_F16,
framework::dataset::make("DataType", { DataType::F16 })),
framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
{
@@ -533,7 +537,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP16, framework
TEST_SUITE_END() // FP16
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::ALL,
- combine(combine(SmallWinogradOutputTransformDatasetNHWC,
+ combine(combine(SmallWinogradOutputTransformDatasetNHWC_F32,
framework::dataset::make("DataType", { DataType::F32 })),
framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
{
@@ -542,7 +546,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixtureFP32, framework
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixtureFP32, framework::DatasetMode::NIGHTLY,
- combine(combine(LargeWinogradOutputTransformDatasetNHWC,
+ combine(combine(LargeWinogradOutputTransformDatasetNHWC_F32,
framework::dataset::make("DataType", { DataType::F32 })),
framework::dataset::make("ActivationInfo",{ ActivationLayerInfo() }) ))
{
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 5525bc4535..06463d577f 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -205,6 +205,7 @@ void initialize_matrix_transform(SimpleTensor<T> &src, const Size2D &output_tile
{ WinogradKey(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
{ WinogradKey(std::pair<int, int>(2, 1), std::pair<int, int>(7, 1), WinogradTransformType::OUTPUT), omatrix2x1_7x7 },
{ WinogradKey(std::pair<int, int>(1, 2), std::pair<int, int>(1, 7), WinogradTransformType::OUTPUT), omatrix2x1_7x7 },
+ { WinogradKey(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7), WinogradTransformType::OUTPUT), omatrix2x1_7x7 },
{ WinogradKey(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5), WinogradTransformType::OUTPUT), omatrix4x4_5x5 },
};