author    Gian Marco Iodice <gianmarco.iodice@arm.com>  2018-06-28 16:29:29 +0100
committer Anthony Barbier <anthony.barbier@arm.com>     2018-11-02 16:54:54 +0000
commit    215b4ea6c9dee480a22070d5873b0b8cb52531a0 (patch)
tree      398e552c4d01c0b84d03a873098a9183ba8f82e4 /src
parent    ad486e21e5870f41774f30825c270762e08ae71e (diff)
COMPMID-1277 - Optimizing CLIm2ColKernel for NHWC.

This patch includes:
- Im2Col optimizations for NHWC using a new data layout
- Refactoring of CLIm2ColKernel adding validation method and auto-init
- Removed im2col_reduced from CLIm2ColKernel and created a new kernel CLFlattenLayerKernel

Change-Id: I1620640b6796baa268324b33ae92cdd8de53e27c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141241
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp                         21
-rw-r--r--  src/core/CL/cl_kernels/convolution_layer.cl             74
-rw-r--r--  src/core/CL/cl_kernels/flatten.cl                       57
-rw-r--r--  src/core/CL/cl_kernels/im2col.cl                       529
-rw-r--r--  src/core/CL/kernels/CLFlattenLayerKernel.cpp           151
-rw-r--r--  src/core/CL/kernels/CLIm2ColKernel.cpp                 474
-rw-r--r--  src/core/CL/kernels/CLWeightsReshapeKernel.cpp           6
-rw-r--r--  src/core/NEON/kernels/NEIm2ColKernel.cpp                31
-rw-r--r--  src/runtime/CL/functions/CLFlattenLayer.cpp             12
-rw-r--r--  src/runtime/CL/functions/CLFullyConnectedLayer.cpp      34
-rw-r--r--  src/runtime/CL/functions/CLGEMM.cpp                      3
-rw-r--r--  src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp     54
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp     6
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp    4
14 files changed, 773 insertions, 683 deletions
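
For context, im2col rewrites convolution as a matrix multiplication: every receptive field of the input becomes one row of a matrix, so the convolution reduces to a single GEMM against the reshaped weights. Below is a minimal host-side sketch of the NCHW variant, with illustrative names, a dense layout and symmetric padding assumed for brevity (this is not the library's API):

    // Each output pixel (xc, yc) becomes row yo = xc + yc * conv_w; each patch
    // element (ch, yk, xk) becomes column xo = ch * k_h * k_w + yk * k_w + xk.
    void im2col_nchw(const float *src, float *dst,
                     int src_w, int src_h, int channels,
                     int k_w, int k_h, int stride_x, int stride_y,
                     int pad_left, int pad_top, float pad_value)
    {
        const int conv_w = (src_w + 2 * pad_left - k_w) / stride_x + 1;
        const int conv_h = (src_h + 2 * pad_top - k_h) / stride_y + 1;
        const int cols   = channels * k_w * k_h;

        for(int ch = 0; ch < channels; ++ch)
            for(int yc = 0; yc < conv_h; ++yc)
                for(int xc = 0; xc < conv_w; ++xc)
                    for(int yk = 0; yk < k_h; ++yk)
                        for(int xk = 0; xk < k_w; ++xk)
                        {
                            const int  xi = xc * stride_x - pad_left + xk;
                            const int  yi = yc * stride_y - pad_top + yk;
                            const int  yo = xc + yc * conv_w;
                            const int  xo = (ch * k_h + yk) * k_w + xk;
                            const bool in = xi >= 0 && xi < src_w && yi >= 0 && yi < src_h;
                            dst[yo * cols + xo] = in ? src[(ch * src_h + yi) * src_w + xi] : pad_value;
                        }
    }

The kernels in this patch implement the same per-work-item mapping, specialized by kernel size, padding and data layout.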
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 64519ff459..29b01e6cea 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -226,6 +226,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
{ "erode", "erode.cl" },
{ "fast_corners", "fast_corners.cl" },
+ { "flatten", "flatten.cl" },
{ "fill_image_borders_constant", "fill_border.cl" },
{ "fill_image_borders_replicate", "fill_border.cl" },
{ "finalize", "optical_flow_pyramid_lk.cl" },
@@ -270,13 +271,12 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "hog_detector", "hog.cl" },
{ "hog_orientation_binning", "hog.cl" },
{ "hysteresis", "canny.cl" },
- { "im2col1x1_stridex1_dchw", "im2col.cl" },
- { "im2col3x3_dchw", "im2col.cl" },
- { "im2col5x5_dchw", "im2col.cl" },
- { "im2col11x11_padx0_pady0_dchw", "im2col.cl" },
- { "im2col_generic_dchw", "im2col.cl" },
- { "im2col_generic_padx0_pady0_dchw", "im2col.cl" },
- { "im2col_reduced_dchw", "im2col.cl" },
+ { "im2col1x1_stridex1_nchw", "im2col.cl" },
+ { "im2col3x3_nchw", "im2col.cl" },
+ { "im2col5x5_nchw", "im2col.cl" },
+ { "im2col11x11_padx0_pady0_nchw", "im2col.cl" },
+ { "im2col_generic_nchw", "im2col.cl" },
+ { "im2col_generic_padx0_pady0_nchw", "im2col.cl" },
{ "im2col3x3_nhwc", "im2col.cl" },
{ "im2col_generic_nhwc", "im2col.cl" },
{ "init_level", "optical_flow_pyramid_lk.cl" },
@@ -333,8 +333,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "remap_nearest_neighbour", "remap.cl" },
{ "remap_bilinear", "remap.cl" },
{ "reshape_layer", "reshape_layer.cl" },
- { "reshape_to_columns_nchw", "convolution_layer.cl" },
- { "reshape_to_columns_nhwc", "convolution_layer.cl" },
+ { "reshape_to_columns", "convolution_layer.cl" },
{ "RGB888_to_IYUV_bt709", "color_convert.cl" },
{ "RGB888_to_NV12_bt709", "color_convert.cl" },
{ "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
@@ -572,6 +571,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
#include "./cl_kernels/fast_corners.clembed"
},
{
+ "flatten.cl",
+#include "./cl_kernels/flatten.clembed"
+ },
+ {
"fill_border.cl",
#include "./cl_kernels/fill_border.clembed"
},
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 9335b047fe..2b75b45fe1 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -53,7 +53,7 @@
* @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
*/
-__kernel void reshape_to_columns_nchw(
+__kernel void reshape_to_columns(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
#ifdef HAS_BIAS
@@ -109,74 +109,4 @@ __kernel void reshape_to_columns_nchw(
}
}
}
-
-/** This kernel reshapes the tensor's low three dimensions to single column
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] bias_ptr Pointer to the bias tensor. Same as @p src_ptr
- * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
- * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] depth The depth of the input tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
- * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
- */
-__kernel void reshape_to_columns_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(bias),
-#endif /* HAS_BIAS */
- uint depth, uint width, uint height, uint total_filters, uint dst_stride_z)
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
-
- __global uchar *tmp_src_ptr = src.ptr;
- __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * width * dst_stride_y + get_global_id(
- 0) * width * height * dst_stride_y;
-#ifdef HAS_BIAS
- __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
-#endif /* HAS_BIAS */
-
- if(is_last_thread)
- {
- for(uint i = 0; i < total_filters; ++i)
- {
- *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
-
-#ifdef HAS_BIAS
- *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
- tmp_bias_ptr += bias_stride_x;
-#endif /* HAS_BIAS */
- tmp_src_ptr += height * src_stride_z;
- tmp_dst_ptr += dst_stride_x;
- }
- }
- else
- {
- for(uint i = 0; i < total_filters; ++i)
- {
- *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
- tmp_src_ptr += height * src_stride_z;
- tmp_dst_ptr += dst_stride_x;
- }
- }
-}
-#endif // defined(DATA_TYPE) && defined(NUM_GROUPS)
\ No newline at end of file
+#endif // defined(DATA_TYPE)
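
The two layout-specific weight-reshape kernels collapse into a single reshape_to_columns because the reshaped weights matrix is the same regardless of data layout: each filter, plus its optional bias, becomes one column. A dense sketch of that layout, with illustrative names (not the library's code):

    // Each of the total_filters kernels becomes a column; the optional bias is
    // appended as a final row, so the row count is w*h*depth (+1 with bias).
    void reshape_weights_to_columns(const float *weights, const float *bias, float *dst,
                                    int w, int h, int depth, int total_filters)
    {
        const int elems = w * h * depth;
        for(int f = 0; f < total_filters; ++f)
        {
            for(int r = 0; r < elems; ++r)
                dst[r * total_filters + f] = weights[f * elems + r];
            if(bias != nullptr)
                dst[elems * total_filters + f] = bias[f];
        }
    }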
diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/flatten.cl
new file mode 100644
index 0000000000..df0f9c4886
--- /dev/null
+++ b/src/core/CL/cl_kernels/flatten.cl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+
+/** This OpenCL kernel flattens the first 3 dimensions of the input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void flatten(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * (int)SRC_WIDTH + get_global_id(2) * (int)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(
+ DATA_TYPE);
+
+ *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
\ No newline at end of file
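
The new flatten kernel is a strided copy: each work-item writes one element to the dense position x + y * SRC_WIDTH + z * SRC_WIDTH * SRC_HEIGHT. A host-side sketch of the same index math, with strides given in elements so that any row padding in the source stays visible (illustrative, not the library's API):

    void flatten_whc(const float *src, float *dst,
                     int w, int h, int c,
                     int src_stride_y, int src_stride_z) // element strides; may include padding
    {
        for(int z = 0; z < c; ++z)
            for(int y = 0; y < h; ++y)
                for(int x = 0; x < w; ++x)
                    dst[x + y * w + z * w * h] = src[x + y * src_stride_y + z * src_stride_z];
    }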
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index d034b30b68..274ec20046 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -35,13 +35,12 @@
#error "Element size not supported"
#endif // ELEMENT_SIZE
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 1x1 and the stride_x = 1
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This OpenCL kernel performs im2col when the kernel size is 1x1, stride_x = 1 and the data layout is NCHW
*
- * @note This kernel computes 4 elements
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
@@ -62,16 +61,16 @@
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col1x1_stridex1_dchw(
+__kernel void im2col1x1_stridex1_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
- const uint yc = get_global_id(1); // y coordinate in the convolved tensor
- const uint ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
+ const uint yc = get_global_id(1); // y coordinate in the convolved tensor
+ const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const uint batch = get_global_id(2) / SRC_DEPTH; // batch size
// Clamp xc
// The strategy clamps at "xc" as it will be a valid value for sure
@@ -107,7 +106,7 @@ __kernel void im2col1x1_stridex1_dchw(
*(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
*((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
@@ -116,18 +115,16 @@ __kernel void im2col1x1_stridex1_dchw(
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
-#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+#if defined(DILATION_X) && defined(DILATION_Y)
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
@@ -151,183 +148,65 @@ __kernel void im2col1x1_stridex1_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_generic_nhwc(
+__kernel void im2col_generic_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int src_stride_y_int = (int)src_stride_y;
- const int src_stride_z_int = (int)src_stride_z;
- const int xc = get_global_id(1); // x coordinate in the convolved tensor
- const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
- const int ch = get_global_id(0); // input feature map
- const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X - PAD_LEFT;
const int yi = yc * STRIDE_Y - PAD_TOP;
// Calculate output indices
- const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH;
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+ // Linearize convolution elements
for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
{
- const int dilated_offset_y = yk * DILATION_Y;
- const int y0 = yi + dilated_offset_y;
- if(y0 >= 0 && y0 < SRC_HEIGHT)
+ int y = yi + yk * DILATION_Y;
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
{
- int xk;
- for(xk = 0; xk < KERNEL_WIDTH; xk++)
+ int x = xi + xk * DILATION_X;
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
{
- const int dilated_offset_x = xk * DILATION_X;
- const int x0 = xi + dilated_offset_x;
- if(x0 >= 0 && x0 < SRC_WIDTH)
- {
- *((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + dilated_offset_x * src_stride_y + dilated_offset_y * src_stride_z, DATA_TYPE);
- }
- else
- {
- *((__global DATA_TYPE *)output_ptr) = PAD_VALUE;
- }
- output_ptr += 1 * sizeof(DATA_TYPE);
+ *output_ptr = PAD_VALUE;
}
- }
- else
- {
- for(int xk = 0; xk < KERNEL_WIDTH; xk++)
+ else
{
- *((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE;
- output_ptr += 1 * dst_stride_x;
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
}
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
}
}
-#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
- {
- *((__global DATA_TYPE *)output_ptr) = 1.0f;
- output_ptr += 1 * dst_stride_x;
- }
-#endif // HAS_BIAS
-}
-
-/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int src_stride_y_int = (int)src_stride_y;
- const int src_stride_z_int = (int)src_stride_z;
- const int xc = get_global_id(1); // x coordinate in the convolved tensor
- const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
- const int ch = get_global_id(0); // input feature map
- const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
-
- // Calculate input indices
- const int xi = xc * STRIDE_X - PAD_LEFT;
- const int yi = yc * STRIDE_Y - PAD_TOP;
-
- // Calculate output indices
- const int xo = ch * 9; // 3x3
- const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
-
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
-
- const int3 y = (int3)yi + (int3)(0, 1, 2);
- // Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer.
- if(y.s0 >= 0 && y.s0 < SRC_HEIGHT)
- {
- row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
- PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE));
- }
-
- if(y.s1 >= 0 && y.s1 < SRC_HEIGHT)
- {
- row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
- PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE));
- }
-
- if(y.s2 >= 0 && y.s2 < SRC_HEIGHT)
- {
- row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
- PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE));
- }
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- // Put 0 if the value is out-of-bound
- const int3 x = (int3)xi + (int3)(0, 1, 2);
- VEC_DATA_TYPE(COND_DATA_TYPE, 3)
- cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
- row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
- row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0);
- row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0);
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
- *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
- *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
+ *output_ptr = 1.0f;
}
#endif // HAS_BIAS
}
+#endif // defined(DILATION_X) && defined(DILATION_Y)
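
The generic NCHW path folds dilation directly into the sampling expression: a dilation of d reads every d-th input pixel under the kernel, so a k-tap kernel covers an effective extent of (k - 1) * d + 1. A one-line sketch of the coordinate computation used above (illustrative names):

    int sample_coord(int out_coord, int stride, int pad, int tap, int dilation)
    {
        return out_coord * stride - pad + tap * dilation; // e.g. x = xi + xk * DILATION_X above
    }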
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3
+/** This OpenCL kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
@@ -350,16 +229,16 @@ __kernel void im2col3x3_nhwc(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col3x3_dchw(
+__kernel void im2col3x3_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X - PAD_LEFT;
@@ -402,19 +281,19 @@ __kernel void im2col3x3_dchw(
*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
}
#endif // HAS_BIAS
}
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
+/** This OpenCL kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
@@ -437,16 +316,16 @@ __kernel void im2col3x3_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col5x5_dchw(
+__kernel void im2col5x5_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X - PAD_LEFT;
@@ -576,20 +455,20 @@ __kernel void im2col5x5_dchw(
}
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)output_ptr) = 1.0f;
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This OpenCL kernel performs im2col when the kernel size is 11x11, there is no padding and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
@@ -610,16 +489,16 @@ __kernel void im2col5x5_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col11x11_padx0_pady0_dchw(
+__kernel void im2col11x11_padx0_pady0_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X;
@@ -776,21 +655,21 @@ __kernel void im2col11x11_padx0_pady0_dchw(
}
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)output_ptr) = 1.0f;
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
- * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This OpenCL kernel performs im2col when the kernel size is greater than 1x1, there is no padding and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
* @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
@@ -810,16 +689,16 @@ __kernel void im2col11x11_padx0_pady0_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_generic_padx0_pady0_dchw(
+__kernel void im2col_generic_padx0_pady0_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X;
@@ -855,25 +734,25 @@ __kernel void im2col_generic_padx0_pady0_dchw(
} /* End of loop over KERNEL_HEIGHT */
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*output_ptr = 1.0f;
}
#endif // HAS_BIAS
}
-#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
*
+ * @note This kernel computes VECTOR_SIZE elements
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
@@ -893,64 +772,154 @@ __kernel void im2col_generic_padx0_pady0_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_generic_dchw(
+__kernel void im2col3x3_nhwc(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
// Calculate input indices
- const int xi = xc * STRIDE_X - PAD_LEFT;
- const int yi = yc * STRIDE_Y - PAD_TOP;
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
- // Calculate output indices
- const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
- const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
- __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
-
- // Linearize convolution elements
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- int y = yi + yk * DILATION_Y;
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
- {
- int x = xi + xk * DILATION_X;
-#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
- {
- *output_ptr = PAD_VALUE;
- }
- else
- {
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
- }
-#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- }
- }
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int3 offset = 0;
+
+ // Clamp xi
+ int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+ xi_offset *= (int3)src_stride_y;
+
+ // Out-of-bound condition for X
+ int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+ // yi == 0
+ // Clamp yi
+ // yi_coord is cast to unsigned int in order to use just a min() operation
+ // A "-1" 32-bit signed variable converted to unsigned gives 4294967295
+ yi_coord = yi - (int)PAD_TOP;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 1
+ // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+ yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 1) >= (uint)(SRC_HEIGHT));
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 2
+ // Clamp yi_coord
+ yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 2) >= (uint)(SRC_HEIGHT));
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // Store
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH);
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
{
- *output_ptr = 1.0f;
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-/**This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
- * the kernel width and height are the same of width and height of the input tensor
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NHWC
*
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -958,35 +927,75 @@ __kernel void im2col_generic_dchw(
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_reduced_dchw(
+__kernel void im2col_generic_nhwc(
TENSOR3D_DECLARATION(src),
- VECTOR_DECLARATION(dst),
- uint width, uint height)
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
- const uint image_size = width * height;
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int i = 0;
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ // Clamp yi_coord
+ int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+ yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
- __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;
+ // Out-of-bound condition for Y
+ int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
- *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+ {
+ // Clamp xi_coord
+ int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+ xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+ // Out-of-bound condition for X
+ int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
+
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+ // Replace with PAD_VALUE if the value is out-of-bound
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition));
+
+ // Store
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+
+ i++;
+ }
+ }
#ifdef HAS_BIAS
- // If it is the last thread in the 3 dimensional workgroup
- if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
{
- tmp_out_ptr += dst_stride_x;
- *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1.0f;
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
}
#endif // HAS_BIAS
}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
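
The NHWC kernel above emits one output row per convolution output position: the element produced by kernel tap i (with i = yk * KERNEL_WIDTH + xk) for channel ch lands at column i * SRC_DEPTH + ch, so channels stay contiguous exactly as in the source tensor. LAST_ACCESSED clamps the starting channel so a VECTOR_SIZE-wide load never reads past SRC_DEPTH; e.g. with SRC_DEPTH = 5 and VECTOR_SIZE = 2 the last thread restarts at channel 3 and overlaps its neighbour instead of over-reading. The scalar C++ reference below is a minimal sketch of that addressing, assuming symmetric padding and floor rounding; the helper name and signature are illustrative, not library code.

    // Hypothetical scalar reference for im2col_generic_nhwc (not library code)
    #include <algorithm>
    #include <vector>

    std::vector<float> im2col_nhwc_ref(const std::vector<float> &src, // H x W x C
                                       int src_w, int src_h, int src_c,
                                       int k_w, int k_h, int stride_x, int stride_y,
                                       int pad_x, int pad_y, int dil_x, int dil_y,
                                       float pad_value)
    {
        // Convolved output size, assuming symmetric padding and floor rounding
        const int conv_w = (src_w + 2 * pad_x - (dil_x * (k_w - 1) + 1)) / stride_x + 1;
        const int conv_h = (src_h + 2 * pad_y - (dil_y * (k_h - 1) + 1)) / stride_y + 1;
        std::vector<float> dst(static_cast<size_t>(conv_w) * conv_h * k_w * k_h * src_c);

        for(int yo = 0; yo < conv_w * conv_h; ++yo)
        {
            const int xi = (yo % conv_w) * stride_x; // matches the kernel's xi/yi
            const int yi = (yo / conv_w) * stride_y;
            int       i  = 0;
            for(int yk = 0; yk < k_h; ++yk)
            {
                const int  y     = yi + yk * dil_y - pad_y;
                const int  yc    = std::max(0, std::min(y, src_h - 1)); // CLAMP
                const bool y_oob = (y < 0) || (y >= src_h);
                for(int xk = 0; xk < k_w; ++xk, ++i)
                {
                    const int  x   = xi + xk * dil_x - pad_x;
                    const int  xc  = std::max(0, std::min(x, src_w - 1));
                    const bool oob = y_oob || (x < 0) || (x >= src_w);
                    for(int ch = 0; ch < src_c; ++ch)
                    {
                        const float v = src[(static_cast<size_t>(yc) * src_w + xc) * src_c + ch];
                        // Column layout i * SRC_DEPTH + ch, as in the OpenCL kernel
                        dst[static_cast<size_t>(yo) * (k_w * k_h * src_c) + i * src_c + ch] = oob ? pad_value : v;
                    }
                }
            }
        }
        return dst;
    }
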
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
new file mode 100644
index 0000000000..0b5feffcc9
--- /dev/null
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_flatten_shape(input));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_flatten_shape(input)));
+
+ Window win = calculate_max_window(*input, Steps()); // Flatten does not need padding
+
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFlattenLayerKernel::CLFlattenLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLFlattenLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("flatten", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "flatten";
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLFlattenLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void CLFlattenLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+
+ Window out_slice = out_window.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_3D();
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
+} // namespace arm_compute
\ No newline at end of file
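
The kernel relies on compute_flatten_shape() for auto-initialization: the first three dimensions (W, H, C) collapse into a single one and any higher dimensions (batches) are preserved. A minimal sketch of that rule, with an illustrative helper name:

    // Hypothetical restatement of compute_flatten_shape(): collapse W * H * C,
    // keep batch dimensions
    #include <cstddef>
    #include <vector>

    std::vector<std::size_t> flatten_shape(const std::vector<std::size_t> &in)
    {
        std::size_t flat = 1;
        for(std::size_t d = 0; d < in.size() && d < 3; ++d)
        {
            flat *= in[d];
        }
        std::vector<std::size_t> out{ flat };
        for(std::size_t d = 3; d < in.size(); ++d)
        {
            out.push_back(in[d]); // batches are preserved
        }
        return out;
    }
    // e.g. { 7, 7, 64, 8 } -> { 3136, 8 }
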
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index a09129bba6..39654e2190 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -31,7 +31,6 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -40,12 +39,22 @@
#include <cmath>
#include <tuple>
+#include <utility>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation)
+struct Im2ColConfiguration
+{
+ std::string kernel_name{};
+ std::set<std::string> build_options{};
+ unsigned int num_elems_processed_per_iteration{};
+ bool is_padding_required_nchw{};
+};
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -54,263 +63,255 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, b
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- // Checks performed when output is configured
- if(output->total_size() != 0)
+ if(output->total_size() > 0)
{
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
return Status{};
}
-inline bool run_im2col_reduced(ITensorInfo *input, ITensorInfo *output, const PadStrideInfo &conv_info)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+ unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw)
{
- int stride_x = 0;
- int stride_y = 0;
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- std::tie(stride_x, stride_y) = conv_info.stride();
+ // Output tensor auto initialization if not yet initialized
+ TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true);
- return (output->dimension(0) == (input->dimension(0) * input->dimension(1) * input->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
- && (std::equal(input->tensor_shape().cbegin() + 3,
- input->tensor_shape().cend(),
- output->tensor_shape().cbegin() + 1))
- && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
-}
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
-} // namespace
+ const DataLayout data_layout = input->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int input_width = input->dimension(width_idx);
+ const unsigned int input_height = input->dimension(height_idx);
-CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _conv_info(), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr), _kernel_dims()
-{
-}
+ // Configure the execution window based on the selected optimal OpenCL kernel
+ bool window_changed = false;
+ Window win;
-std::string
-CLIm2ColKernel::configure_window(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims,
- const Size2D &dilation, const PadStrideInfo &conv_info, CLBuildOptions &build_opts)
-{
- std::string kernel_name;
- bool is_optimized_path = false;
- const bool reduced = run_im2col_reduced(input->info(), output->info(), conv_info);
- const DataType data_type = input->info()->data_type();
- const bool squared_im2col = kernel_dims.width == kernel_dims.height;
- const DataLayout data_layout = input->info()->data_layout();
+ if(data_layout == DataLayout::NHWC)
+ {
+ win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int input_width = input->info()->dimension(width_idx);
- const unsigned int input_height = input->info()->dimension(height_idx);
- const unsigned int input_channel = input->info()->dimension(channel_idx);
+ const int xin_start = 0;
+ const int xin_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration) : input->dimension(0);
+ const int yin_start = 0;
+ const int yin_end = input->dimension(1);
- if(!reduced)
+ const int xout_start = 0;
+ const int xout_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration) : output->dimension(0);
+ const int yout_start = 0;
+ const int yout_end = output->dimension(1);
+
+ AccessWindowStatic input_access(input, xin_start, yin_start, xin_end, yin_end);
+ AccessWindowStatic output_access(output, xout_start, yout_start, xout_end, yout_end);
+ window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
+ }
+ else
{
- // Default kernel name
- if(data_layout == DataLayout::NCHW)
+ if(is_padding_required_nchw)
{
- kernel_name = "im2col_generic_dchw";
+ const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
+ win = calculate_max_window(*input,
+ Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
+ AccessWindowStatic input_access(input,
+ -border.left,
+ -border.top,
+ ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
+ input_height + border.bottom);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
}
else
{
- kernel_name = "im2col_generic_nhwc";
+ // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
+ // update_window_and_padding() can be skipped
+ win = calculate_max_window(*input, Steps());
}
+ }
+
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set the Z dimension's step to the size of the whole dimension so that it cannot be split across Z
+ win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
- _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
-
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
- build_opts.add_option("-DKERNEL_DEPTH=" + support::cpp11::to_string(input_channel));
- build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(_convolved_dims.first));
- build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second));
- build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
- build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+{
+ const DataLayout data_layout = input->data_layout();
+ const DataType data_type = input->data_type();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int input_width = input->dimension(width_idx);
+ const unsigned int input_height = input->dimension(height_idx);
+ const unsigned int input_channel = input->dimension(channel_idx);
+
+ const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+
+ // Im2Col configuration
+ std::string kernel_name = "im2col_generic_";
+ CLBuildOptions build_opts;
+ unsigned int num_elems_processed_per_iteration = 1;
+ bool is_padding_required_nchw = false;
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->element_size()));
+ build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+ build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+ build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
+ build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+ build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+ build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->quantization_info().offset), "-DPAD_VALUE=0");
+ build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+
+ if(data_layout == DataLayout::NHWC)
+ {
+ num_elems_processed_per_iteration = 2;
+ is_padding_required_nchw = false;
+
+ // Only the 3x3 case is optimized for NHWC
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ kernel_name = "im2col3x3_";
+ }
+
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(input_channel - num_elems_processed_per_iteration), 0)));
+ }
+ else
+ {
if(dilation == Size2D(1U, 1U))
{
+ const bool squared_im2col = kernel_dims.width == kernel_dims.height;
if(squared_im2col)
{
- // Check if we can run an optimized im2col
+ // Check if we can run an optimized im2col for NCHW
switch(kernel_dims.width)
{
case 1:
// Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
- if(conv_info.stride().first == 1 && !conv_info.has_padding() && data_layout == DataLayout::NCHW)
+ if(conv_info.stride().first == 1 && !conv_info.has_padding())
{
- // Set hint for LWS
- _lws_hint = cl::NDRange(1, 1, 8);
- _num_elems_processed_per_iteration = 4;
- is_optimized_path = true;
- kernel_name = "im2col1x1_stridex1_dchw";
+ kernel_name = "im2col1x1_stridex1_";
+ num_elems_processed_per_iteration = 4;
+ is_padding_required_nchw = true;
}
break;
case 3:
- _lws_hint = cl::NDRange(1, 1, 8);
- _num_elems_processed_per_iteration = 1;
- is_optimized_path = true;
- switch(data_layout)
- {
- case DataLayout::NCHW:
- kernel_name = "im2col3x3_dchw";
- break;
- case DataLayout::NHWC:
- kernel_name = "im2col3x3_nhwc";
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
+ kernel_name = "im2col3x3_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
break;
case 5:
- _num_elems_processed_per_iteration = 1;
- switch(data_layout)
- {
- case DataLayout::NCHW:
- is_optimized_path = true;
- kernel_name = "im2col5x5_dchw";
- break;
- default:
- // using generic_nhwc
- is_optimized_path = false;
- break;
- }
+ kernel_name = "im2col5x5_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
break;
case 11:
- _num_elems_processed_per_iteration = 1;
// Optimized im2col11x11 if pad_x = pad_y = 0
- if(!conv_info.has_padding() && data_layout == DataLayout::NCHW)
+ if(!conv_info.has_padding())
{
- is_optimized_path = true;
- kernel_name = "im2col11x11_padx0_pady0_dchw";
+ kernel_name = "im2col11x11_padx0_pady0_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
}
break;
default:
- is_optimized_path = false;
+ kernel_name = "im2col_generic_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = false;
break;
}
}
else if(kernel_dims.width > 1 && !conv_info.has_padding())
{
- _num_elems_processed_per_iteration = 1;
- is_optimized_path = false;
-
- if(data_layout == DataLayout::NCHW)
- {
- kernel_name = "im2col_generic_padx0_pady0_dchw";
-
- // Optimized im2col is performed using one or more vector operations with the specified vector size
- // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
- // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
- // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
- // Using the vector size of 8, however, may be faster.
- size_t vector_size = 4;
- // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
- // is used instead.)
- if(kernel_dims.width < vector_size)
- {
- vector_size = kernel_dims.width;
- }
- // Vector size optimized for the 11x11 AlexNet convolution on Bifrost.
- const GPUTarget gpu_target = get_target();
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && kernel_dims.width == 11)
- {
- vector_size = 8;
- }
- const size_t width_mod_vector_size = kernel_dims.width % vector_size;
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
- }
+ kernel_name = "im2col_generic_padx0_pady0_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = false;
+
+ // Optimized im2col is performed using one or more vector operations with the specified vector size
+ // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+ // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+ // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+ // Using the vector size of 8, however, may be faster.
+ // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+ // is used instead.)
+ const size_t vector_size = std::min(static_cast<size_t>(4), kernel_dims.width);
+ const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+ build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
}
}
- _run_func = &CLIm2ColKernel::run_generic;
- }
- else
- {
- _num_elems_processed_per_iteration = 1;
- kernel_name = "im2col_reduced_dchw";
- _run_func = &CLIm2ColKernel::run_reduced;
- }
- // Configure kernel window
- Window win;
- if(is_optimized_path)
- {
- if(data_layout == DataLayout::NHWC)
- {
- win = calculate_max_window(*input->info(),
- Steps(_num_elems_processed_per_iteration),
- false,
- BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()));
- const int x = -conv_info.pad_left();
- const int y = -conv_info.pad_top();
- const int h = kernel_dims.width * _num_elems_processed_per_iteration;
- const int w = 1;
- AccessWindowRectangle input_access(input->info(), x, y, w, h);
- update_window_and_padding(win, input_access);
- }
- else
- {
- const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- win = calculate_max_window(*input->info(),
- Steps(_num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
- AccessWindowStatic input_access(input->info(),
- -border.left,
- -border.top,
- ceil_to_multiple(input_width + border.right, kernel_dims.width * _num_elems_processed_per_iteration),
- input_height + border.bottom);
- update_window_and_padding(win, input_access);
- }
- }
- else
- {
- // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
- // update_window_and_padding() can be skipped
- win = calculate_max_window(*input->info(), Steps());
}
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- if(!reduced)
- {
- // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
- win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
- }
- ICLKernel::configure(win);
- return kernel_name;
+ // Append the data layout to the kernel_name
+ kernel_name += lower_string(string_from_data_layout(data_layout));
+
+ Im2ColConfiguration im2col_config;
+ im2col_config.kernel_name = kernel_name;
+ im2col_config.build_options = build_opts.options();
+ im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+ im2col_config.is_padding_required_nchw = is_padding_required_nchw;
+
+ return im2col_config;
+}
+} // namespace
+
+CLIm2ColKernel::CLIm2ColKernel()
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info()
+{
}
void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation));
- _input = input;
- _output = output;
- _kernel_dims = kernel_dims;
- _conv_info = conv_info;
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int input_width = input->info()->dimension(width_idx);
+ const unsigned int input_height = input->info()->dimension(height_idx);
- const DataType data_type = input->info()->data_type();
+ // Select and configure the optimal OpenCL kernel to run.
+ // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
+ // and the padding requirement flag
+ Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation);
// Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
- build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(im2col_config.kernel_name, im2col_config.build_options));
- _num_elems_processed_per_iteration = 1;
+ _input = input;
+ _output = output;
+ _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+ _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
+ _kernel_dims = kernel_dims; // Only needed by the Tuner
+ _conv_info = conv_info; // Only needed by the Tuner
- const std::string kernel_name = configure_window(input, output, kernel_dims, dilation, conv_info, build_opts);
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+ im2col_config.is_padding_required_nchw);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
// Set config_id for enabling LWS tuning
- _config_id = kernel_name;
+ _config_id = im2col_config.kernel_name;
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
_config_id += "_";
@@ -323,31 +324,24 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
- ARM_COMPUTE_UNUSED(kernel_dims);
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_UNUSED(has_bias);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias, dilation));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation));
+ Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+ im2col_config.is_padding_required_nchw)
+ .first);
return Status{};
}
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
{
- ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
- (this->*_run_func)(window, queue);
-}
-
-void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
-{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const DataLayout data_layout = _input->info()->data_layout();
// Get initial windows
+ // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- // Change the Z dimension's step back to 1
window_collapsed.set_dimension_step(Window::DimZ, 1);
Window window_output;
@@ -359,36 +353,32 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
Window slice_in = first_slice_3d;
Window slice_out = window_output.first_slice_window_2D();
- const bool out_dim_not_same_input_dim = _convolved_dims.first != _input->info()->dimension(width_idx) || _convolved_dims.second != _input->info()->dimension(height_idx);
+ if(data_layout == DataLayout::NHWC)
+ {
+ const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
+ const int num_batches = tmp_win[3].end();
- // Setup slice if convolved dims are not the same as input dims
- if(out_dim_not_same_input_dim)
+ slice.set(1, Window::Dimension(0, static_cast<int>(_output->info()->tensor_shape()[1]), 1));
+ slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
+ }
+ else
{
- // If the stride_x or stride_y are not 1, the output tensor of matrix multiply (Convolved tensor) will not
- // have the same shape of the im2col input tensor
- // In this case we need to re-compute the window using the shape of the tensor after matrix multiply (convolved_dims)
- slice.set(width_idx, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
- if(data_layout == DataLayout::NHWC)
- {
- // if layout is NHWC, we need to multiply convolved_dims.height by the number of batches as for this
- // format we collapsed HEIGHT and all subsequent dimensions (batches) together. This is necessary to ensure
- // global_id(2) values are in the correct range.
- const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
- const int num_batches = tmp_win[3].end();
- slice.set(height_idx, Window::Dimension(0, static_cast<int>(_convolved_dims.second) * num_batches, 1));
- }
- else
- {
- slice.set(height_idx, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- }
+ slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
+ slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+ // Note: In case of NCHW the 3rd dimension is already set by collapsing the input window
}
// Setup input slice
- // The first three dimensions of the input are increased by the inner loops
+ // The input dimensions are iterated within the OpenCL kernel
slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ // Setup output slice
+ // The output dimensions are iterated within the OpenCL kernel
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
do
{
unsigned int idx = 0;
@@ -399,30 +389,4 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
enqueue(queue, *this, slice, _lws_hint);
}
while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
-}
-
-void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window out_slice = out_window.first_slice_window_1D();
- Window in_slice = window.first_slice_window_3D();
-
- // Run kernel
- do
- {
- // Set arguments
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _output, out_slice);
-
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
- enqueue(queue, *this, in_slice, _lws_hint);
- }
- while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
-}
+}
\ No newline at end of file
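
On NHWC the kernel now processes two channels per iteration and only the 3x3 case has a specialized variant; on NCHW the choice depends on kernel size, stride, dilation and padding. The helper below is a condensed, hypothetical restatement of the NCHW selection for dilation 1x1 (the real configure_opencl_kernel() also assembles the build options and the padding requirement flag):

    // Hypothetical restatement of the NCHW kernel selection (dilation == 1x1)
    #include <string>

    std::string select_nchw_kernel(unsigned k_w, unsigned k_h, unsigned stride_x, bool has_padding)
    {
        std::string name = "im2col_generic_";
        if(k_w == k_h)
        {
            switch(k_w)
            {
                case 1:
                    if(stride_x == 1 && !has_padding) { name = "im2col1x1_stridex1_"; }
                    break;
                case 3:
                    name = "im2col3x3_";
                    break;
                case 5:
                    name = "im2col5x5_";
                    break;
                case 11:
                    if(!has_padding) { name = "im2col11x11_padx0_pady0_"; }
                    break;
                default:
                    break;
            }
        }
        else if(k_w > 1 && !has_padding)
        {
            name = "im2col_generic_padx0_pady0_";
        }
        return name + "nchw"; // the data layout suffix is appended last
    }
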
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 9df91fccde..58ecd9ccb3 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -87,8 +87,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *
(biases != nullptr) ? biases->info() : nullptr,
output->info(), num_groups));
- const DataType data_type = input->info()->data_type();
- const DataLayout data_layout = input->info()->data_layout();
+ const DataType data_type = input->info()->data_type();
_biases = biases;
_output = output;
@@ -101,8 +100,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *
build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
// Create kernel
- std::string kernel_name = std::string("reshape_to_columns_") + lower_string(string_from_data_layout(data_layout));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts.options()));
// Set static arguments
unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 61010711a6..8cb4f4b889 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -49,29 +49,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
- TensorShape expected_output_shape;
- if(is_flatten) /* Called by FlattenLayer */
+ if(output->total_size() > 0)
{
- expected_output_shape = misc::shape_calculator::compute_im2col_flatten_shape(input);
- }
- else if(!is_fully_connected) /* Called by ConvolutionLayer */
- {
- expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false);
- }
- else /* Called by FullyConnectedLayer */
- {
- const int num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
- const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
+ TensorShape expected_output_shape;
- expected_output_shape = misc::shape_calculator::compute_im2col_fc_shape(input, num_input_dimensions);
- }
+ if(is_flatten || is_fully_connected)
+ {
+ expected_output_shape = misc::shape_calculator::compute_flatten_shape(input);
+ }
+ else
+ {
+ expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false);
+ }
- TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+ TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
return Status{};
}
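
The reworked validation defers the shape check until the output is configured and unifies the flatten and fully connected callers on compute_flatten_shape(), while convolution callers keep the im2col convolution shape, whose leading dimension counts one element per kernel tap and channel plus an optional bias. A minimal sketch of that count, with a hypothetical helper name:

    // Hypothetical helper for the leading im2col dimension checked above
    unsigned im2col_num_columns(unsigned k_w, unsigned k_h, unsigned channels, bool has_bias)
    {
        return k_w * k_h * channels + (has_bias ? 1U : 0U);
    }
    // e.g. a 3x3 kernel over 64 channels with bias -> 3 * 3 * 64 + 1 = 577
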
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index f5809a218a..b372c35dd9 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -23,8 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
@@ -32,8 +31,13 @@ using namespace arm_compute;
void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLIm2ColKernel>();
- k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+ auto k = arm_compute::support::cpp14::make_unique<CLFlattenLayerKernel>();
+ k->configure(input, output);
_kernel = std::move(k);
CLScheduler::get().tune_kernel_static(*_kernel);
}
+
+Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLFlattenLayerKernel::validate(input, output);
+}
\ No newline at end of file
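
A possible usage sketch of the new flatten path (not part of the patch; the shapes and the default_init()/sync() calls are illustrative). The output tensor is left uninitialized so the kernel's auto-initialization can fill it in:

    // Hypothetical example: flatten a 7x7x64 feature map into 3136 elements
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"

    using namespace arm_compute;

    void flatten_example()
    {
        CLScheduler::get().default_init();

        CLTensor src{}, dst{};
        src.allocator()->init(TensorInfo(TensorShape(7U, 7U, 64U), 1, DataType::F32));
        // dst is auto-initialized to { 3136 } by CLFlattenLayerKernel::configure()

        CLFlattenLayer flatten{};
        flatten.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();

        flatten.run();
        CLScheduler::get().sync();
    }
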
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 6fd78a3fc9..60c28a0874 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -73,12 +73,11 @@ Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c
}
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _im2col_kernel(), _convert_weights(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
- _accumulate_biases_kernel(), _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
+ : _memory_group(memory_manager), _convert_weights(), _flatten_layer(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+ _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
_is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
{
}
-
void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
if(_is_quantized)
@@ -111,20 +110,19 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
- // Initialize output tensor for im2col
- TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
- _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+ // Initialize output tensor for flatten
+ TensorShape shape_flatten = compute_flatten_shape(input->info());
+ _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW));
- // Configure im2col kernel
- _memory_group.manage(&_im2col_output);
- _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
- CLScheduler::get().tune_kernel_static(_im2col_kernel);
+ // Configure flatten kernel
+ _memory_group.manage(&_flatten_output);
+ _flatten_layer.configure(input, &_flatten_output);
// Configure matrix multiply kernel
- configure_mm(&_im2col_output, weights, output);
+ configure_mm(&_flatten_output, weights, output);
- // Allocate the output tensor for im2col once all the configure methods have been called
- _im2col_output.allocator()->allocate();
+ // Allocate the output tensor for flatten once all the configure methods have been called
+ _flatten_output.allocator()->allocate();
}
void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
@@ -254,7 +252,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const GPUTarget gpu_target = CLScheduler::get().target();
- const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)).set_data_layout(DataLayout::NCHW));
+ const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -311,9 +309,9 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
// Fully Connected layer after a Convolution Layer without batches
ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
- // Validate im2col kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false));
- input_to_use = &im2col_input;
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
+ input_to_use = &flatten_input;
}
else
{
@@ -341,7 +339,7 @@ void CLFullyConnectedLayer::run()
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
- CLScheduler::get().enqueue(_im2col_kernel, false);
+ _flatten_layer.run();
}
// Run matrix multiply
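
Because the flatten output feeds the GEMM directly in the conv-to-FC path, validation requires the weights to expose one row per element of the flattened input, as checked above. A minimal restatement with hypothetical names:

    // Hypothetical restatement of the FC-after-conv dimension check
    bool fc_after_conv_shapes_match(unsigned in_w, unsigned in_h, unsigned in_c, unsigned weights_dim1)
    {
        return weights_dim1 == in_w * in_h * in_c; // one weights row per flattened input element
    }
    // e.g. a 7x7x64 feature map requires weights with dimension(1) == 3136
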
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 1d1b17bbf1..a8d7058f2a 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -171,6 +171,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
// Check if we need to reshape the matrix B only on the first run
const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
@@ -180,7 +181,7 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
- TensorInfo tmp_output_info = *output->clone();
+ TensorInfo tmp_output_info{};
// Get the GPU target
const GPUTarget gpu_target = CLScheduler::get().target();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index fb90415e31..49549a0ad0 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -171,7 +171,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
const DataLayout data_layout = input->info()->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
const unsigned int kernel_width = weights->info()->dimension(idx_width);
@@ -193,7 +192,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
ICLTensor *gemm_output_to_use = output;
ICLTensor *gemm_output_staged_to_use = output;
- const unsigned bias_element = (_append_bias && !_skip_im2col) ? 1 : 0;
const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
// Get parameters from conv_info
@@ -212,7 +210,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
dilation);
unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->info()->dimension(idx_width) * weights->info()->dimension(idx_height) * weights->info()->dimension(idx_channel) + bias_element;
// _weights_reshaped will be auto configured in the kernel.
// Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
@@ -223,25 +220,13 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
// Create tensor to store im2col reshaped inputs
if(!_skip_im2col)
{
- // Calculate im2col shape
- // For OpenCL the batch size is on the third dimension
- // TODO (giaiod01): Use auto-init COMPMID-1277
- TensorShape shape_im2col = input->info()->tensor_shape();
- if(shape_im2col.num_dimensions() >= 3)
- {
- shape_im2col.remove_dimension(2);
- }
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
-
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo im2col_reshaped_info(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(input->info()->quantization_info());
- _im2col_output.allocator()->init(im2col_reshaped_info);
_memory_group.manage(&_im2col_output);
- // Configure and tune im2col
+ // Configure and tune im2col. The im2col output shape is auto-initialized
_im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
+
+ // Set quantization info
+ _im2col_output.info()->set_quantization_info(input->info()->quantization_info());
CLScheduler::get().tune_kernel_static(_im2col_kernel);
// Update GEMM input
@@ -350,11 +335,10 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
const ITensorInfo *gemm_output_staged_to_use = output;
const ITensorInfo *weights_to_use = weights;
- const bool is_nhwc = data_layout == DataLayout::NHWC;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized;
- const bool append_bias = (biases != nullptr) && (!is_quantized);
- const unsigned bias_element = (append_bias && !skip_im2col) ? 1 : 0;
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized;
+ const bool append_bias = (biases != nullptr) && (!is_quantized);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -391,7 +375,6 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
dilation);
unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + bias_element;
// Output tensor auto inizialitation if not yet initialized
ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
@@ -400,19 +383,14 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
if(!skip_im2col)
{
- // Create tensor info for im2col reshaped inputs
- // For OpenCL the batch size is on the third dimension
- // TODO (giaiod01): Use auto-init COMPMID-1277
- TensorShape shape_im2col = input->tensor_shape();
- if(input->tensor_shape().num_dimensions() >= 3)
- {
- shape_im2col.remove_dimension(2);
- }
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
- im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(input->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+ const Size2D kernel_dims(kernel_width, kernel_height);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true);
+
+ auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation));
gemm_input_to_use = &im2col_reshaped_info;
}
else if(append_bias)
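
The hand-built im2col shape is gone from validate(): compute_im2col_conv_shape(..., true) now drives the auto-initialization, producing kw * kh * C (+1 for an appended bias) columns, conv_w * conv_h rows, and the batch on the third dimension for OpenCL, as the removed code also did. A sketch with an illustrative helper name:

    // Hypothetical restatement of the OpenCL im2col output shape
    // (batch_size_on_z == true)
    #include <array>

    std::array<unsigned, 3> im2col_conv_shape_cl(unsigned k_w, unsigned k_h, unsigned channels,
                                                 unsigned conv_w, unsigned conv_h,
                                                 unsigned batches, bool append_bias)
    {
        return { k_w * k_h * channels + (append_bias ? 1U : 0U), conv_w * conv_h, batches };
    }
    // e.g. 3x3 over 64 channels, 56x56 convolved output, batch 1, bias
    // appended -> { 577, 3136, 1 }
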
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 25b8adc431..c2f0283d4e 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -113,7 +113,7 @@ void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITenso
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
- TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
+ TensorShape shape_im2col = compute_flatten_shape(input->info());
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
// Configure im2col kernel
@@ -249,7 +249,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
bool is_fc_after_conv = true;
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
+ const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -420,4 +420,4 @@ void NEFullyConnectedLayer::prepare()
_is_prepared = true;
}
-}
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index c0a5d0a436..df4a040bad 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -223,7 +223,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
{
// Calculate im2col shape
// For NEON the batch size is on the fourth dimension
- // TODO (giaiod01): Use auto-init COMPMID-1277
+ // TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482
TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_weights_rows);
shape_im2col.set(1, conv_w * conv_h);
@@ -232,7 +232,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_memory_group.manage(&_im2col_output);
- // Configure and tune im2col
+ // Configure im2col
_im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
// Update GEMM input
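
On NEON the im2col output shape is still set by hand until COMPMID-1482 lands, with the batch kept on the fourth dimension. A worked example with illustrative numbers (11x11 kernel over a 227x227x3 input, stride 4, no padding, bias appended):

    // Illustrative arithmetic for the NEON shape_im2col set above:
    // conv_w = conv_h  = (227 - 11) / 4 + 1 = 55
    // mat_weights_rows = 11 * 11 * 3 + 1    = 364 (bias appended)
    // shape_im2col     = { 364, 55 * 55 }   = { 364, 3025 } per batch
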