author    Gian Marco Iodice <gianmarco.iodice@arm.com>  2018-06-28 16:29:29 +0100
committer Anthony Barbier <anthony.barbier@arm.com>     2018-11-02 16:54:54 +0000
commit    215b4ea6c9dee480a22070d5873b0b8cb52531a0
tree      398e552c4d01c0b84d03a873098a9183ba8f82e4
parent    ad486e21e5870f41774f30825c270762e08ae71e
COMPMID-1277 - Optimizing CLIm2ColKernel for NHWC.
This patch includes:

- Im2Col optimizations for NHWC using a new data layout
- Refactoring of CLIm2ColKernel adding validation method and auto-init
- Removed im2col_reduced from CLIm2ColKernel and created a new kernel CLFlattenLayerKernel

Change-Id: I1620640b6796baa268324b33ae92cdd8de53e27c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141241
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
-rw-r--r--  arm_compute/core/CL/CLKernels.h                            1
-rw-r--r--  arm_compute/core/CL/kernels/CLFlattenLayerKernel.h        74
-rw-r--r--  arm_compute/core/CL/kernels/CLIm2ColKernel.h              41
-rw-r--r--  arm_compute/core/utils/misc/ShapeCalculator.h             14
-rw-r--r--  arm_compute/runtime/CL/functions/CLFlattenLayer.h         17
-rw-r--r--  arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h   6
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp                           21
-rw-r--r--  src/core/CL/cl_kernels/convolution_layer.cl               74
-rw-r--r--  src/core/CL/cl_kernels/flatten.cl                         57
-rw-r--r--  src/core/CL/cl_kernels/im2col.cl                         529
-rw-r--r--  src/core/CL/kernels/CLFlattenLayerKernel.cpp             151
-rw-r--r--  src/core/CL/kernels/CLIm2ColKernel.cpp                   474
-rw-r--r--  src/core/CL/kernels/CLWeightsReshapeKernel.cpp             6
-rw-r--r--  src/core/NEON/kernels/NEIm2ColKernel.cpp                  31
-rw-r--r--  src/runtime/CL/functions/CLFlattenLayer.cpp               12
-rw-r--r--  src/runtime/CL/functions/CLFullyConnectedLayer.cpp        34
-rw-r--r--  src/runtime/CL/functions/CLGEMM.cpp                        3
-rw-r--r--  src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp       54
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp       6
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp      4
-rw-r--r--  tests/datasets/ShapeDatasets.h                             6
-rw-r--r--  tests/validation/CL/Im2Col.cpp                            54
-rw-r--r--  tests/validation/CL/LocallyConnected.cpp                  17
-rw-r--r--  tests/validation/NEON/Im2Col.cpp                          30
-rw-r--r--  tests/validation/fixtures/FlattenLayerFixture.h            2
-rw-r--r--  tests/validation/fixtures/Im2ColFixture.h                 11
-rw-r--r--  tests/validation/reference/Im2Col.cpp                    109
-rw-r--r--  tests/validation/reference/Im2Col.h                        2
28 files changed, 1027 insertions(+), 813 deletions(-)
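For orientation: im2col rewrites convolution as a matrix multiply, with one row per output pixel and one column per receptive-field element. A back-of-envelope shape check (illustrative numbers only, not library code):

// Sketch: im2col output shape for a convolution, no dilation assumed.
// 4x4x3 input, 3x3 kernel, stride 1, no padding -> 2x2 convolved output.
#include <cstdio>

int main()
{
    const int src_w = 4, src_h = 4, src_d = 3; // input W/H/C (illustrative)
    const int k_w = 3, k_h = 3, stride = 1, pad = 0;
    const int conv_w = (src_w + 2 * pad - k_w) / stride + 1; // = 2
    const int conv_h = (src_h + 2 * pad - k_h) / stride + 1; // = 2
    // 4 rows (output pixels) x 27 columns (3*3*3 weights, +1 with bias)
    std::printf("im2col matrix: %d rows x %d cols\n", conv_w * conv_h, k_w * k_h * src_d);
    return 0;
}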
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 737d8df4e8..ea942bde5d 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -62,6 +62,7 @@
#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
diff --git a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
new file mode 100644
index 0000000000..a5cf6e0c33
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFLATTENLAYERKERNEL_H__
+#define __ARM_COMPUTE_CLFLATTENLAYERKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL interface for the flatten kernel.*/
+class CLFlattenLayerKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLFlattenLayerKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFlattenLayerKernel(const CLFlattenLayerKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFlattenLayerKernel &operator=(const CLFlattenLayerKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLFlattenLayerKernel(CLFlattenLayerKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLFlattenLayerKernel &operator=(CLFlattenLayerKernel &&) = default;
+ /** Set the input and output of the kernel.
+ *
+ * @param[in] input First input tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
+ * w, h and d are respectively the width, height and depth of the input tensor. Data type supported: same as @p input
+ */
+ void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayerKernel
+ *
+ * @param[in] input First input tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] output Output tensor with shape [w*h*d, input_batches] where:
+ * w, h and d are respectively the width, height and depth of the input tensor. Data type supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+public:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLFLATTENLAYERKERNEL_H__ */
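The kernel's contract, restated as a scalar reference (a sketch with illustrative names, not the OpenCL implementation): an input of shape [w, h, d, batches] becomes [w*h*d, batches], with x the fastest-varying coordinate.

// Reference semantics of the new kernel: [w, h, d, N] -> [w*h*d, N]
template <typename T>
void flatten_ref(const T *src, T *dst, int w, int h, int d, int batches)
{
    for(int b = 0; b < batches; ++b)
    {
        for(int z = 0; z < d; ++z)
        {
            for(int y = 0; y < h; ++y)
            {
                for(int x = 0; x < w; ++x)
                {
                    const int flat = x + y * w + z * w * h; // linearised (x, y, z)
                    dst[b * (w * h * d) + flat] = src[((b * d + z) * h + y) * w + x];
                }
            }
        }
    }
}

For a dense tensor this reduces to a contiguous copy; the kernel earns its keep when the source has padded row or plane strides.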
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index fc930abcbe..ae19319047 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -96,48 +96,13 @@ public:
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
-private:
- /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and kernel's low 3 dimensions are same as input)
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- * @param[in,out] queue Command queue on which to enqueue the kernel.
- */
- void run_reduced(const Window &window, cl::CommandQueue &queue);
- /** run the generic convolution layer input reshape kernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- * @param[in,out] queue Command queue on which to enqueue the kernel.
- */
- void run_generic(const Window &window, cl::CommandQueue &queue);
-
- /** Chooses and configure the right kernel for the given input arguments.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32
- * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] dilation Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided expands the matrix with 1.
- * @param[out] build_opts OpenCL buil program options.
- *
- * @return the name of the kernel chosen
- */
- std::string configure_window(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims,
- const Size2D &dilation, const PadStrideInfo &conv_info, CLBuildOptions &build_opts);
-
- /** Common signature for the kernel to run */
- using Im2ColFunction = void (CLIm2ColKernel::*)(const Window &, cl::CommandQueue &);
-
public:
const ICLTensor *_input;
ICLTensor *_output;
- PadStrideInfo _conv_info;
std::pair<unsigned int, unsigned int> _convolved_dims;
- unsigned int _num_elems_processed_per_iteration;
- Im2ColFunction _run_func;
- Size2D _kernel_dims;
+ unsigned int _num_elems_processed_per_iteration;
+ Size2D _kernel_dims;
+ PadStrideInfo _conv_info;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 1e5b9afd0e..0a2a535502 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -215,19 +215,13 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz
return output_shape;
}
-inline TensorShape compute_im2col_fc_shape(const ITensorInfo *input, const int num_input_dimensions = 3)
+inline TensorShape compute_flatten_shape(const ITensorInfo *input)
{
- TensorShape output_shape{ input->tensor_shape() };
-
- output_shape.collapse(num_input_dimensions);
+ // The output shape will be the flattened version of the input (i.e. [ width * height * channels, num_batches, ... ] ). Used for FlattenLayer and FullyConnectedLayer.
- return output_shape;
-}
-inline TensorShape compute_im2col_flatten_shape(const ITensorInfo *input)
-{
- // The output shape will be the flatten version of the input (i.e. [ width * height * channels, 1, 1, ... ] ). Used for FlattenLayer.
TensorShape output_shape{ input->tensor_shape() };
- output_shape.collapse(3, 0);
+
+ output_shape.collapse(3);
return output_shape;
}
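A worked example of the renamed helper (shapes are illustrative):

// compute_flatten_shape, worked through:
//   [24, 24, 16, 2]  --collapse(3)-->  [24 * 24 * 16, 2] = [9216, 2]
//   [ 7,  7, 64, 1]  --collapse(3)-->  [ 7 *  7 * 64, 1] = [3136, 1]
// Width, height and channels merge into dimension 0; the batch dimension(s)
// survive, which is what both FlattenLayer and FullyConnectedLayer need.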
diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
index 88df4a7f96..ebc0e5e53f 100644
--- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
@@ -33,7 +33,7 @@ class ICLTensor;
/** Basic function to execute flatten. This function calls the following OpenCL kernel:
*
-* -# @ref CLIm2ColKernel
+* -# @ref CLFlattenLayerKernel
*
*/
class CLFlattenLayer : public ICLSimpleFunction
@@ -41,11 +41,22 @@ class CLFlattenLayer : public ICLSimpleFunction
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input First input tensor to flatten with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F16/F32
+ * @param[in] input First input tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
* @param[out] output Output tensor with shape [w*h*d, input_batches] where:
- * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
+ * w, h and d are respectively the width, height and depth of the input tensor. Data type supported: same as @p input
*/
void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayer
+ *
+ * @param[in] input First input tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] output Output tensor with shape [w*h*d, input_batches] where:
+ * w, h and d are respectively the width, height and depth of the input tensor. Data type supported: same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
};
} // namespace arm_compute
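A plausible call sequence for the function above (a sketch; tensor allocation, mapping and data transfer are omitted, and the conversion of Status to bool is based on the library's usual validate() pattern):

// Sketch of typical use:
CLTensor input{};  // assume initialised as [W, H, D, N]
CLTensor output{}; // configure() auto-initialises the output shape
if(bool(CLFlattenLayer::validate(input.info(), output.info()))) // OK == true
{
    CLFlattenLayer flatten;
    flatten.configure(&input, &output);
    flatten.run();
}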
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index e8fe8e47a2..450cd831ee 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -27,11 +27,11 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
@@ -130,14 +130,14 @@ private:
void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
CLMemoryGroup _memory_group;
- CLIm2ColKernel _im2col_kernel;
CLConvertFullyConnectedWeights _convert_weights;
+ CLFlattenLayer _flatten_layer;
CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
CLGEMM _mm_gemm;
CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
- CLTensor _im2col_output;
+ CLTensor _flatten_output;
CLTensor _gemmlowp_output;
CLTensor _converted_weights_output;
CLTensor _reshape_weights_output;
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 64519ff459..29b01e6cea 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -226,6 +226,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
{ "erode", "erode.cl" },
{ "fast_corners", "fast_corners.cl" },
+ { "flatten", "flatten.cl" },
{ "fill_image_borders_constant", "fill_border.cl" },
{ "fill_image_borders_replicate", "fill_border.cl" },
{ "finalize", "optical_flow_pyramid_lk.cl" },
@@ -270,13 +271,12 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "hog_detector", "hog.cl" },
{ "hog_orientation_binning", "hog.cl" },
{ "hysteresis", "canny.cl" },
- { "im2col1x1_stridex1_dchw", "im2col.cl" },
- { "im2col3x3_dchw", "im2col.cl" },
- { "im2col5x5_dchw", "im2col.cl" },
- { "im2col11x11_padx0_pady0_dchw", "im2col.cl" },
- { "im2col_generic_dchw", "im2col.cl" },
- { "im2col_generic_padx0_pady0_dchw", "im2col.cl" },
- { "im2col_reduced_dchw", "im2col.cl" },
+ { "im2col1x1_stridex1_nchw", "im2col.cl" },
+ { "im2col3x3_nchw", "im2col.cl" },
+ { "im2col5x5_nchw", "im2col.cl" },
+ { "im2col11x11_padx0_pady0_nchw", "im2col.cl" },
+ { "im2col_generic_nchw", "im2col.cl" },
+ { "im2col_generic_padx0_pady0_nchw", "im2col.cl" },
{ "im2col3x3_nhwc", "im2col.cl" },
{ "im2col_generic_nhwc", "im2col.cl" },
{ "init_level", "optical_flow_pyramid_lk.cl" },
@@ -333,8 +333,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "remap_nearest_neighbour", "remap.cl" },
{ "remap_bilinear", "remap.cl" },
{ "reshape_layer", "reshape_layer.cl" },
- { "reshape_to_columns_nchw", "convolution_layer.cl" },
- { "reshape_to_columns_nhwc", "convolution_layer.cl" },
+ { "reshape_to_columns", "convolution_layer.cl" },
{ "RGB888_to_IYUV_bt709", "color_convert.cl" },
{ "RGB888_to_NV12_bt709", "color_convert.cl" },
{ "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
@@ -572,6 +571,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
#include "./cl_kernels/fast_corners.clembed"
},
{
+ "flatten.cl",
+#include "./cl_kernels/flatten.clembed"
+ },
+ {
"fill_border.cl",
#include "./cl_kernels/fill_border.clembed"
},
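For context, the new "flatten" map entry is consumed by the usual configure() pattern: the kernel name selects the .cl program, and per-configuration defines are passed as build options. A sketch (the option values are illustrative; the real ones are derived from the tensor info):

CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=float"); // illustrative defines only
build_opts.add_option("-DSRC_WIDTH=24");
build_opts.add_option("-DSRC_HEIGHT=24");
cl::Kernel kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("flatten", build_opts.options()));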
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 9335b047fe..2b75b45fe1 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -53,7 +53,7 @@
* @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
*/
-__kernel void reshape_to_columns_nchw(
+__kernel void reshape_to_columns(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
#ifdef HAS_BIAS
@@ -109,74 +109,4 @@ __kernel void reshape_to_columns_nchw(
}
}
}
-
-/** This kernel reshapes the tensor's low three dimensions to single column
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] bias_ptr Pointer to the bias tensor. Same as @p src_ptr
- * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
- * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] depth The depth of the input tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
- * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
- */
-__kernel void reshape_to_columns_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
-#ifdef HAS_BIAS
- VECTOR_DECLARATION(bias),
-#endif /* HAS_BIAS */
- uint depth, uint width, uint height, uint total_filters, uint dst_stride_z)
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
-
- __global uchar *tmp_src_ptr = src.ptr;
- __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * width * dst_stride_y + get_global_id(
- 0) * width * height * dst_stride_y;
-#ifdef HAS_BIAS
- __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
-#endif /* HAS_BIAS */
-
- if(is_last_thread)
- {
- for(uint i = 0; i < total_filters; ++i)
- {
- *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
-
-#ifdef HAS_BIAS
- *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
- tmp_bias_ptr += bias_stride_x;
-#endif /* HAS_BIAS */
- tmp_src_ptr += height * src_stride_z;
- tmp_dst_ptr += dst_stride_x;
- }
- }
- else
- {
- for(uint i = 0; i < total_filters; ++i)
- {
- *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr);
- tmp_src_ptr += height * src_stride_z;
- tmp_dst_ptr += dst_stride_x;
- }
- }
-}
-#endif // defined(DATA_TYPE) && defined(NUM_GROUPS)
\ No newline at end of file
+#endif // defined(DATA_TYPE)
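The surviving reshape_to_columns kernel linearises each [kernel_w, kernel_h, IFM] filter into one column of the reshaped weights matrix, appending one bias value per filter when -DHAS_BIAS is set. A scalar reference (a sketch, not the CL code):

// Each filter becomes one column of kw*kh*ifm (+1 bias) values;
// dst is row-major with ofm columns (one per filter).
template <typename T>
void reshape_to_columns_ref(const T *w, const T *bias, T *dst,
                            int kw, int kh, int ifm, int ofm)
{
    const int col_len = kw * kh * ifm + (bias != nullptr ? 1 : 0);
    for(int f = 0; f < ofm; ++f)
    {
        for(int i = 0; i < kw * kh * ifm; ++i)
        {
            dst[i * ofm + f] = w[f * kw * kh * ifm + i];
        }
        if(bias != nullptr)
        {
            dst[(col_len - 1) * ofm + f] = bias[f]; // bias goes in the last row
        }
    }
}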
diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/flatten.cl
new file mode 100644
index 0000000000..df0f9c4886
--- /dev/null
+++ b/src/core/CL/cl_kernels/flatten.cl
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+
+/** This opencl kernel flattens the first 3 dimensions of the input tensor
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void flatten(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * (int)SRC_WIDTH + get_global_id(2) * (int)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(
+ DATA_TYPE);
+
+ *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
\ No newline at end of file
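The kernel body boils down to a single address computation; the same arithmetic in plain C++ (a sketch):

#include <cstddef>

// The store address computed by the flatten kernel above:
// (x + y*SRC_WIDTH + z*SRC_WIDTH*SRC_HEIGHT) elements, converted to bytes.
inline std::size_t flatten_dst_offset(std::size_t x, std::size_t y, std::size_t z,
                                      std::size_t src_width, std::size_t src_height,
                                      std::size_t element_size)
{
    return (x + y * src_width + z * src_width * src_height) * element_size;
}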
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index d034b30b68..274ec20046 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -35,13 +35,12 @@
#error "Element size not support"
#endif // ELEMENT_SIZE
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 1x1 and the stride_x = 1
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
*
- * @note This kernel computes 4 elements
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
@@ -62,16 +61,16 @@
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col1x1_stridex1_dchw(
+__kernel void im2col1x1_stridex1_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
- const uint yc = get_global_id(1); // y coordinate in the convolved tensor
- const uint ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
+ const uint yc = get_global_id(1); // y coordinate in the convolved tensor
+ const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const uint batch = get_global_id(2) / SRC_DEPTH; // batch size
// Clamp xc
// The strategy clamps at "xc" as it will be a valid value for sure
@@ -107,7 +106,7 @@ __kernel void im2col1x1_stridex1_dchw(
*(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
*((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
@@ -116,18 +115,16 @@ __kernel void im2col1x1_stridex1_dchw(
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
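In the 1x1/stride-1 case the receptive field is a single pixel, so im2col reduces to a per-pixel copy with one column per input channel. A scalar sketch of the mapping (illustrative helper, not library code); with a 1x1 kernel, stride 1 and no padding, the convolved size equals the input size:

template <typename T>
void im2col_1x1_s1_nchw_ref(const T *src, T *dst, int w, int h, int c, bool has_bias)
{
    const int cols = c + (has_bias ? 1 : 0); // one column per channel (+ bias)
    for(int y = 0; y < h; ++y)
    {
        for(int x = 0; x < w; ++x)
        {
            const int row = x + y * w; // yo = xc + yc * CONVOLVED_WIDTH
            for(int ch = 0; ch < c; ++ch)
            {
                dst[row * cols + ch] = src[(ch * h + y) * w + x];
            }
            if(has_bias)
            {
                dst[row * cols + c] = T(1); // bias column appended to each row
            }
        }
    }
}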
-#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+#if defined(DILATION_X) && defined(DILATION_Y)
+/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
@@ -151,183 +148,65 @@ __kernel void im2col1x1_stridex1_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_generic_nhwc(
+__kernel void im2col_generic_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int src_stride_y_int = (int)src_stride_y;
- const int src_stride_z_int = (int)src_stride_z;
- const int xc = get_global_id(1); // x coordinate in the convolved tensor
- const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
- const int ch = get_global_id(0); // input feature map
- const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X - PAD_LEFT;
const int yi = yc * STRIDE_Y - PAD_TOP;
// Calculate output indices
- const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH;
+ const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+ __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+ // Linearize convolution elements
for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
{
- const int dilated_offset_y = yk * DILATION_Y;
- const int y0 = yi + dilated_offset_y;
- if(y0 >= 0 && y0 < SRC_HEIGHT)
+ int y = yi + yk * DILATION_Y;
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
{
- int xk;
- for(xk = 0; xk < KERNEL_WIDTH; xk++)
+ int x = xi + xk * DILATION_X;
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
{
- const int dilated_offset_x = xk * DILATION_X;
- const int x0 = xi + dilated_offset_x;
- if(x0 >= 0 && x0 < SRC_WIDTH)
- {
- *((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + dilated_offset_x * src_stride_y + dilated_offset_y * src_stride_z, DATA_TYPE);
- }
- else
- {
- *((__global DATA_TYPE *)output_ptr) = PAD_VALUE;
- }
- output_ptr += 1 * sizeof(DATA_TYPE);
+ *output_ptr = PAD_VALUE;
}
- }
- else
- {
- for(int xk = 0; xk < KERNEL_WIDTH; xk++)
+ else
{
- *((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE;
- output_ptr += 1 * dst_stride_x;
+ *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
}
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
}
}
-#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
- {
- *((__global DATA_TYPE *)output_ptr) = 1.0f;
- output_ptr += 1 * dst_stride_x;
- }
-#endif // HAS_BIAS
-}
-
-/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- const int src_stride_y_int = (int)src_stride_y;
- const int src_stride_z_int = (int)src_stride_z;
- const int xc = get_global_id(1); // x coordinate in the convolved tensor
- const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor
- const int ch = get_global_id(0); // input feature map
- const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size
-
- // Calculate input indices
- const int xi = xc * STRIDE_X - PAD_LEFT;
- const int yi = yc * STRIDE_Y - PAD_TOP;
-
- // Calculate output indices
- const int xo = ch * 9; // 3x3
- const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
-
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
- VEC_DATA_TYPE(DATA_TYPE, 3)
- row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE);
-
- const int3 y = (int3)yi + (int3)(0, 1, 2);
- // Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer.
- if(y.s0 >= 0 && y.s0 < SRC_HEIGHT)
- {
- row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
- PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE));
- }
-
- if(y.s1 >= 0 && y.s1 < SRC_HEIGHT)
- {
- row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
- PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE));
- }
-
- if(y.s2 >= 0 && y.s2 < SRC_HEIGHT)
- {
- row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(
- PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE),
- PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE));
- }
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- // Put 0 if the value is out-of-bound
- const int3 x = (int3)xi + (int3)(0, 1, 2);
- VEC_DATA_TYPE(COND_DATA_TYPE, 3)
- cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
- row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
- row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0);
- row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0);
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
- *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
- *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
+ *output_ptr = 1.0f;
}
#endif // HAS_BIAS
}
+#endif // defined(DILATION_X) && defined(DILATION_Y)
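A scalar reference of the generic NCHW path above, covering strides, padding and dilation (a sketch that loops over what each work-item computes; the extra bias column handled by -DHAS_BIAS is omitted):

template <typename T>
void im2col_generic_nchw_ref(const T *src, T *dst,
                             int src_w, int src_h, int src_c,
                             int k_w, int k_h,
                             int stride_x, int stride_y,
                             int pad_left, int pad_top,
                             int dil_x, int dil_y,
                             int conv_w, int conv_h, T pad_value)
{
    const int row_len = src_c * k_w * k_h; // columns per output row
    for(int ch = 0; ch < src_c; ++ch)
    {
        for(int yc = 0; yc < conv_h; ++yc)
        {
            for(int xc = 0; xc < conv_w; ++xc)
            {
                const int yo = xc + yc * conv_w; // one row per output pixel
                int       xo = ch * k_w * k_h;   // column base for this channel
                const int xi = xc * stride_x - pad_left;
                const int yi = yc * stride_y - pad_top;
                for(int yk = 0; yk < k_h; ++yk)
                {
                    for(int xk = 0; xk < k_w; ++xk, ++xo)
                    {
                        const int  x   = xi + xk * dil_x;
                        const int  y   = yi + yk * dil_y;
                        const bool oob = (x < 0 || x >= src_w || y < 0 || y >= src_h);
                        dst[yo * row_len + xo] = oob ? pad_value : src[(ch * src_h + y) * src_w + x];
                    }
                }
            }
        }
    }
}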
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3
+/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
@@ -350,16 +229,16 @@ __kernel void im2col3x3_nhwc(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col3x3_dchw(
+__kernel void im2col3x3_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X - PAD_LEFT;
@@ -402,19 +281,19 @@ __kernel void im2col3x3_dchw(
*((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
}
#endif // HAS_BIAS
}
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5
+/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
* @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
@@ -437,16 +316,16 @@ __kernel void im2col3x3_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col5x5_dchw(
+__kernel void im2col5x5_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X - PAD_LEFT;
@@ -576,20 +455,20 @@ __kernel void im2col5x5_dchw(
}
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)output_ptr) = 1.0f;
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
* @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
@@ -610,16 +489,16 @@ __kernel void im2col5x5_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col11x11_padx0_pady0_dchw(
+__kernel void im2col11x11_padx0_pady0_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X;
@@ -776,21 +655,21 @@ __kernel void im2col11x11_padx0_pady0_dchw(
}
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*((__global DATA_TYPE *)output_ptr) = 1.0f;
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
- * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW
*
* @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
* @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
* @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
@@ -810,16 +689,16 @@ __kernel void im2col11x11_padx0_pady0_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_generic_padx0_pady0_dchw(
+__kernel void im2col_generic_padx0_pady0_nchw(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int xc = get_global_id(0); // x coordinate in the convolved tensor
+ const int yc = get_global_id(1); // y coordinate in the convolved tensor
+ const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
+ const int batch = get_global_id(2) / SRC_DEPTH; // batch size
// Calculate input indices
const int xi = xc * STRIDE_X;
@@ -855,25 +734,25 @@ __kernel void im2col_generic_padx0_pady0_dchw(
} /* End of loop over KERNEL_HEIGHT */
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if(ch == (SRC_DEPTH - 1))
{
*output_ptr = 1.0f;
}
#endif // HAS_BIAS
}
-#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
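The NHWC kernels added below clamp the y coordinate with a single min() by first casting to unsigned: a negative index wraps to a huge value and so clamps high. A minimal sketch of that trick:

#include <algorithm>

// Casting to unsigned maps any negative coordinate to a huge value
// (e.g. -1 -> 4294967295), so one min() clamps both below 0 and
// above SRC_HEIGHT - 1.
static inline int clamp_y_coord(int yi_coord, int src_height)
{
    return static_cast<int>(std::min(static_cast<unsigned int>(yi_coord),
                                     static_cast<unsigned int>(src_height - 1)));
}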
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
*
+ * @note This kernel computes VECTOR_SIZE elements
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
* @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
* @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
@@ -893,64 +772,154 @@ __kernel void im2col_generic_padx0_pady0_dchw(
* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_generic_dchw(
+__kernel void im2col3x3_nhwc(
TENSOR3D_DECLARATION(src),
IMAGE_DECLARATION(dst),
uint src_stride_w,
uint dst_stride_w)
{
- const int xc = get_global_id(0); // x coordinate in the convolved tensor
- const int yc = get_global_id(1); // y coordinate in the convolved tensor
- const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map
- const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size
+ const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
// Calculate input indices
- const int xi = xc * STRIDE_X - PAD_LEFT;
- const int yi = yc * STRIDE_Y - PAD_TOP;
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
- // Calculate output indices
- const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
- const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
-
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
- __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
-
- // Linearize convolution elements
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- int y = yi + yk * DILATION_Y;
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
- {
- int x = xi + xk * DILATION_X;
-#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
- {
- *output_ptr = PAD_VALUE;
- }
- else
- {
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
- }
-#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- }
- }
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int3 offset = 0;
+
+ // Clamp xi
+ int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+ xi_offset *= (int3)src_stride_y;
+
+ // Out-of-bound condition for X
+ int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+ // yi == 0
+ // Clamp yi
+ // yi_coord is cast to unsigned int in order to use just a min() operation
+ // A "-1" 32-bit signed variable converted to unsigned gives 4294967295
+ yi_coord = yi - (int)PAD_TOP;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 1
+ // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+ yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 2
+ // Clamp yi_coord
+ yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // Store
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH);
+ VSTORE(VECTOR_SIZE)
+ (values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH);
#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
{
- *output_ptr = 1.0f;
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
}
#endif // HAS_BIAS
}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-/**This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
- * the kernel width and height are the same of width and height of the input tensor
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NHWC
*
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store for out-of-bounds loads must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note If biases are added to the convolution, -DHAS_BIAS has to be passed at compile time so that a 1 is appended to each row of the output matrix.
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -958,35 +927,75 @@ __kernel void im2col_generic_dchw(
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
*/
-__kernel void im2col_reduced_dchw(
+__kernel void im2col_generic_nhwc(
TENSOR3D_DECLARATION(src),
- VECTOR_DECLARATION(dst),
- uint width, uint height)
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
- const uint image_size = width * height;
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output addresses
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int i = 0;
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ // Clamp yi_coord
+ int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+ yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
- __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x;
+ // Out-of-bound condition for Y
+ int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
- *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr);
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+ {
+ // Clamp xi_coord
+ int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+ xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+ // Out-of-bound condition for X
+ int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
+
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+ // Replace with PAD_VALUE if the value is out-of-bound
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition));
+
+ // Store
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+
+ i++;
+ }
+ }
#ifdef HAS_BIAS
- // If it is the last thread in the 3 dimensional workgroup
- if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1))
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
{
- tmp_out_ptr += dst_stride_x;
- *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1.0f;
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
}
#endif // HAS_BIAS
}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
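
A minimal host-side C++ sketch of the two ideas the NHWC kernels above rely on: the single-min() clamp of a possibly negative coordinate, and the channel-innermost row layout written by the VSTORE calls. All shapes and numbers below are made up for illustration.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Branch-free clamp into [0, size - 1]: a negative 32-bit coordinate cast to
    // unsigned becomes a huge value, so a single min() handles both bounds.
    static int clamp_coord(int coord, uint32_t size)
    {
        return static_cast<int>(std::min(static_cast<uint32_t>(coord), size - 1));
    }

    int main()
    {
        assert(clamp_coord(-2, 128) == 127); // out-of-bounds low: the address stays legal; the value is later replaced by PAD_VALUE
        assert(clamp_coord(64, 128) == 64);  // in-bounds coordinates are untouched
        assert(clamp_coord(130, 128) == 127);

        // Each im2col output row holds KERNEL_WIDTH * KERNEL_HEIGHT blocks of
        // SRC_DEPTH contiguous channels (hence the "+ i * SRC_DEPTH" stores),
        // plus one trailing 1 per row when HAS_BIAS is defined.
        const int kernel_w = 3, kernel_h = 3, src_depth = 64;
        assert(kernel_w * kernel_h * src_depth == 576);
        return 0;
    }
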
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
new file mode 100644
index 0000000000..0b5feffcc9
--- /dev/null
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_flatten_shape(input));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_flatten_shape(input)));
+
+ Window win = calculate_max_window(*input, Steps()); // Flatten does not need paddings
+
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFlattenLayerKernel::CLFlattenLayerKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLFlattenLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("flatten", build_opts.options()));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "flatten";
+ _config_id += "_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLFlattenLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void CLFlattenLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+
+ Window out_slice = out_window.first_slice_window_1D();
+ Window in_slice = window.first_slice_window_3D();
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
+} // namespace arm_compute
\ No newline at end of file
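
The validation above compares the output shape against compute_flatten_shape(input). Below is a rough stand-in for that helper, assuming it collapses the first three dimensions and keeps any remaining batch dimensions (the real implementation lives in arm_compute/core/utils/misc/ShapeCalculator.h):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Hypothetical stand-in for misc::shape_calculator::compute_flatten_shape():
    // collapse [W, H, C, batches...] into [W * H * C, batches...].
    static std::vector<std::size_t> flatten_shape(const std::vector<std::size_t> &in)
    {
        std::vector<std::size_t> out;
        out.push_back(in.at(0) * in.at(1) * in.at(2));
        out.insert(out.end(), in.begin() + 3, in.end());
        return out;
    }

    int main()
    {
        const std::vector<std::size_t> expected{7 * 7 * 512, 4};
        assert(flatten_shape({7, 7, 512, 4}) == expected); // one flattened row per batch
        return 0;
    }
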
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index a09129bba6..39654e2190 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -31,7 +31,6 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -40,12 +39,22 @@
#include <cmath>
#include <tuple>
+#include <utility>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation)
+struct Im2ColConfiguration
+{
+ std::string kernel_name{};
+ std::set<std::string> build_options{};
+ unsigned int num_elems_processed_per_iteration{};
+ bool is_padding_required_nchw{};
+};
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -54,263 +63,255 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, b
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- // Checks performed when output is configured
- if(output->total_size() != 0)
+ if(output->total_size() > 0)
{
+ const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
return Status{};
}
-inline bool run_im2col_reduced(ITensorInfo *input, ITensorInfo *output, const PadStrideInfo &conv_info)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
+ unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw)
{
- int stride_x = 0;
- int stride_y = 0;
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- std::tie(stride_x, stride_y) = conv_info.stride();
+ // Output tensor auto initialization if not yet initialized
+ TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true);
- return (output->dimension(0) == (input->dimension(0) * input->dimension(1) * input->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
- && (std::equal(input->tensor_shape().cbegin() + 3,
- input->tensor_shape().cend(),
- output->tensor_shape().cbegin() + 1))
- && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
-}
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape));
-} // namespace
+ const DataLayout data_layout = input->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int input_width = input->dimension(width_idx);
+ const unsigned int input_height = input->dimension(height_idx);
-CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _conv_info(), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr), _kernel_dims()
-{
-}
+ // Configure the execution window based on the selected optimal OpenCL kernel
+ bool window_changed = false;
+ Window win;
-std::string
-CLIm2ColKernel::configure_window(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims,
- const Size2D &dilation, const PadStrideInfo &conv_info, CLBuildOptions &build_opts)
-{
- std::string kernel_name;
- bool is_optimized_path = false;
- const bool reduced = run_im2col_reduced(input->info(), output->info(), conv_info);
- const DataType data_type = input->info()->data_type();
- const bool squared_im2col = kernel_dims.width == kernel_dims.height;
- const DataLayout data_layout = input->info()->data_layout();
+ if(data_layout == DataLayout::NHWC)
+ {
+ win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const unsigned int input_width = input->info()->dimension(width_idx);
- const unsigned int input_height = input->info()->dimension(height_idx);
- const unsigned int input_channel = input->info()->dimension(channel_idx);
+ const int xin_start = 0;
+ const int xin_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration) : input->dimension(0);
+ const int yin_start = 0;
+ const int yin_end = input->dimension(1);
- if(!reduced)
+ const int xout_start = 0;
+ const int xout_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration) : output->dimension(0);
+ const int yout_start = 0;
+ const int yout_end = output->dimension(1);
+
+ AccessWindowStatic input_access(input, xin_start, yin_start, xin_end, yin_end);
+ AccessWindowStatic output_access(output, xout_start, yout_start, xout_end, yout_end);
+ window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
+ }
+ else
{
- // Default kernel name
- if(data_layout == DataLayout::NCHW)
+ if(is_padding_required_nchw)
{
- kernel_name = "im2col_generic_dchw";
+ const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
+ win = calculate_max_window(*input,
+ Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
+ AccessWindowStatic input_access(input,
+ -border.left,
+ -border.top,
+ ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
+ input_height + border.bottom);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
}
else
{
- kernel_name = "im2col_generic_nhwc";
+ // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
+ // update_window_and_padding() can be skipped
+ win = calculate_max_window(*input, Steps());
}
+ }
+
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+ // Set the Z dimension's step to the size of the whole dimension so that the window cannot be split across the Z dimension
+ win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
- _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
-
- build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
- build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
- build_opts.add_option("-DKERNEL_DEPTH=" + support::cpp11::to_string(input_channel));
- build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(_convolved_dims.first));
- build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second));
- build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
- build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
- build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
- build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
- build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
- build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
- build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
+{
+ const DataLayout data_layout = input->data_layout();
+ const DataType data_type = input->data_type();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int input_width = input->dimension(width_idx);
+ const unsigned int input_height = input->dimension(height_idx);
+ const unsigned int input_channel = input->dimension(channel_idx);
+
+ const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+
+ // Im2Col configuration
+ std::string kernel_name = "im2col_generic_";
+ CLBuildOptions build_opts;
+ unsigned int num_elems_processed_per_iteration = 1;
+ bool is_padding_required_nchw = false;
+
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->element_size()));
+ build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+ build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+ build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
+ build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+ build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+ build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+ build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
+ build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+ build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+ build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->quantization_info().offset), "-DPAD_VALUE=0");
+ build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+
+ if(data_layout == DataLayout::NHWC)
+ {
+ num_elems_processed_per_iteration = 2;
+ is_padding_required_nchw = false;
+
+ // Only the 3x3 case is optimized for NHWC
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ kernel_name = "im2col3x3_";
+ }
+
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(input_channel - num_elems_processed_per_iteration), 0)));
+ }
+ else
+ {
if(dilation == Size2D(1U, 1U))
{
+ const bool squared_im2col = kernel_dims.width == kernel_dims.height;
if(squared_im2col)
{
- // Check if we can run an optimized im2col
+ // Check if we can run an optimized im2col for NCHW
switch(kernel_dims.width)
{
case 1:
// Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
- if(conv_info.stride().first == 1 && !conv_info.has_padding() && data_layout == DataLayout::NCHW)
+ if(conv_info.stride().first == 1 && !conv_info.has_padding())
{
- // Set hint for LWS
- _lws_hint = cl::NDRange(1, 1, 8);
- _num_elems_processed_per_iteration = 4;
- is_optimized_path = true;
- kernel_name = "im2col1x1_stridex1_dchw";
+ kernel_name = "im2col1x1_stridex1_";
+ num_elems_processed_per_iteration = 4;
+ is_padding_required_nchw = true;
}
break;
case 3:
- _lws_hint = cl::NDRange(1, 1, 8);
- _num_elems_processed_per_iteration = 1;
- is_optimized_path = true;
- switch(data_layout)
- {
- case DataLayout::NCHW:
- kernel_name = "im2col3x3_dchw";
- break;
- case DataLayout::NHWC:
- kernel_name = "im2col3x3_nhwc";
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
+ kernel_name = "im2col3x3_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
break;
case 5:
- _num_elems_processed_per_iteration = 1;
- switch(data_layout)
- {
- case DataLayout::NCHW:
- is_optimized_path = true;
- kernel_name = "im2col5x5_dchw";
- break;
- default:
- // using generic_nhwc
- is_optimized_path = false;
- break;
- }
+ kernel_name = "im2col5x5_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
break;
case 11:
- _num_elems_processed_per_iteration = 1;
// Optimized im2col11x11 if pad_x = pad_y = 0
- if(!conv_info.has_padding() && data_layout == DataLayout::NCHW)
+ if(!conv_info.has_padding())
{
- is_optimized_path = true;
- kernel_name = "im2col11x11_padx0_pady0_dchw";
+ kernel_name = "im2col11x11_padx0_pady0_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = true;
}
break;
default:
- is_optimized_path = false;
+ kernel_name = "im2col_generic_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = false;
break;
}
}
else if(kernel_dims.width > 1 && !conv_info.has_padding())
{
- _num_elems_processed_per_iteration = 1;
- is_optimized_path = false;
-
- if(data_layout == DataLayout::NCHW)
- {
- kernel_name = "im2col_generic_padx0_pady0_dchw";
-
- // Optimized im2col is performed using one or more vector operations with the specified vector size
- // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
- // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
- // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
- // Using the vector size of 8, however, may be faster.
- size_t vector_size = 4;
- // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
- // is used instead.)
- if(kernel_dims.width < vector_size)
- {
- vector_size = kernel_dims.width;
- }
- // Vector size optimized for the 11x11 AlexNet convolution on Bifrost.
- const GPUTarget gpu_target = get_target();
- if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && kernel_dims.width == 11)
- {
- vector_size = 8;
- }
- const size_t width_mod_vector_size = kernel_dims.width % vector_size;
- build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
- }
+ kernel_name = "im2col_generic_padx0_pady0_";
+ num_elems_processed_per_iteration = 1;
+ is_padding_required_nchw = false;
+
+ // Optimized im2col is performed using one or more vector operations with the specified vector size
+ // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+ // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+ // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+ // Using the vector size of 8, however, may be faster.
+ // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+ // is used instead.)
+ const size_t vector_size = std::min(static_cast<size_t>(4), kernel_dims.width);
+ const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+ build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
}
}
- _run_func = &CLIm2ColKernel::run_generic;
- }
- else
- {
- _num_elems_processed_per_iteration = 1;
- kernel_name = "im2col_reduced_dchw";
- _run_func = &CLIm2ColKernel::run_reduced;
- }
- // Configure kernel window
- Window win;
- if(is_optimized_path)
- {
- if(data_layout == DataLayout::NHWC)
- {
- win = calculate_max_window(*input->info(),
- Steps(_num_elems_processed_per_iteration),
- false,
- BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()));
- const int x = -conv_info.pad_left();
- const int y = -conv_info.pad_top();
- const int h = kernel_dims.width * _num_elems_processed_per_iteration;
- const int w = 1;
- AccessWindowRectangle input_access(input->info(), x, y, w, h);
- update_window_and_padding(win, input_access);
- }
- else
- {
- const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- win = calculate_max_window(*input->info(),
- Steps(_num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
- AccessWindowStatic input_access(input->info(),
- -border.left,
- -border.top,
- ceil_to_multiple(input_width + border.right, kernel_dims.width * _num_elems_processed_per_iteration),
- input_height + border.bottom);
- update_window_and_padding(win, input_access);
- }
- }
- else
- {
- // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
- // update_window_and_padding() can be skipped
- win = calculate_max_window(*input->info(), Steps());
}
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- if(!reduced)
- {
- // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
- win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
- }
- ICLKernel::configure(win);
- return kernel_name;
+ // Append the data layout to the kernel_name
+ kernel_name += lower_string(string_from_data_layout(data_layout));
+
+ Im2ColConfiguration im2col_config;
+ im2col_config.kernel_name = kernel_name;
+ im2col_config.build_options = build_opts.options();
+ im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+ im2col_config.is_padding_required_nchw = is_padding_required_nchw;
+
+ return im2col_config;
+}
+} // namespace
+
+CLIm2ColKernel::CLIm2ColKernel()
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info()
+{
}
void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation));
- _input = input;
- _output = output;
- _kernel_dims = kernel_dims;
- _conv_info = conv_info;
+ const DataLayout data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int input_width = input->info()->dimension(width_idx);
+ const unsigned int input_height = input->info()->dimension(height_idx);
- const DataType data_type = input->info()->data_type();
+ // Select and configure the optimal OpenCL kernel to run.
+ // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
+ // and the padding requirement flag
+ Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation);
// Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
- build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
- build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(im2col_config.kernel_name, im2col_config.build_options));
- _num_elems_processed_per_iteration = 1;
+ _input = input;
+ _output = output;
+ _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+ _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
+ _kernel_dims = kernel_dims; // Only needed by the Tuner
+ _conv_info = conv_info; // Only needed by the Tuner
- const std::string kernel_name = configure_window(input, output, kernel_dims, dilation, conv_info, build_opts);
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+ im2col_config.is_padding_required_nchw);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
// Set config_id for enabling LWS tuning
- _config_id = kernel_name;
+ _config_id = im2col_config.kernel_name;
_config_id += "_";
_config_id += lower_string(string_from_data_type(input->info()->data_type()));
_config_id += "_";
@@ -323,31 +324,24 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
{
- ARM_COMPUTE_UNUSED(kernel_dims);
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_UNUSED(has_bias);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias, dilation));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation));
+ Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+ im2col_config.is_padding_required_nchw)
+ .first);
return Status{};
}
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
{
- ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
- (this->*_run_func)(window, queue);
-}
-
-void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
-{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const DataLayout data_layout = _input->info()->data_layout();
// Get initial windows
+ // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- // Change the Z dimension's step back to 1
window_collapsed.set_dimension_step(Window::DimZ, 1);
Window window_output;
@@ -359,36 +353,32 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
Window slice_in = first_slice_3d;
Window slice_out = window_output.first_slice_window_2D();
- const bool out_dim_not_same_input_dim = _convolved_dims.first != _input->info()->dimension(width_idx) || _convolved_dims.second != _input->info()->dimension(height_idx);
+ if(data_layout == DataLayout::NHWC)
+ {
+ const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
+ const int num_batches = tmp_win[3].end();
- // Setup slice if convolved dims are not the same as input dims
- if(out_dim_not_same_input_dim)
+ slice.set(1, Window::Dimension(0, static_cast<int>(_output->info()->tensor_shape()[1]), 1));
+ slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
+ }
+ else
{
- // If the stride_x or stride_y are not 1, the output tensor of matrix multiply (Convolved tensor) will not
- // have the same shape of the im2col input tensor
- // In this case we need to re-compute the window using the shape of the tensor after matrix multiply (convolved_dims)
- slice.set(width_idx, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
- if(data_layout == DataLayout::NHWC)
- {
- // if layout is NHWC, we need to multiply convolved_dims.height by the number of batches as for this
- // format we collapsed HEIGHT and all subsequent dimensions (batches) together. This is necessary to ensure
- // global_id(2) values are in the correct range.
- const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
- const int num_batches = tmp_win[3].end();
- slice.set(height_idx, Window::Dimension(0, static_cast<int>(_convolved_dims.second) * num_batches, 1));
- }
- else
- {
- slice.set(height_idx, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- }
+ slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
+ slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+ // Note: In case of NCHW the 3rd dimension is already set when collapsing the input window
}
// Setup input slice
- // The first three dimensions of the input are increased by the inner loops
+ // The dimensions of the input are increased within the OpenCL kernel
slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ // Setup output slice
+ // The dimensions of the output are increased within the OpenCL kernel
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
do
{
unsigned int idx = 0;
@@ -399,30 +389,4 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
enqueue(queue, *this, slice, _lws_hint);
}
while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
-}
-
-void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window out_slice = out_window.first_slice_window_1D();
- Window in_slice = window.first_slice_window_3D();
-
- // Run kernel
- do
- {
- // Set arguments
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _output, out_slice);
-
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
- _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
- enqueue(queue, *this, in_slice, _lws_hint);
- }
- while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
-}
+}
\ No newline at end of file
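
configure() above derives _convolved_dims from scaled_dimensions(). A back-of-the-envelope version of that computation for a single axis, ignoring the rounding policies the real helper supports:

    #include <cassert>

    // Standard convolution output-size formula with dilation (floor rounding).
    static unsigned int convolved_size(unsigned int in, unsigned int kernel, unsigned int stride,
                                       unsigned int pad_before, unsigned int pad_after, unsigned int dilation)
    {
        const unsigned int effective_kernel = (kernel - 1) * dilation + 1;
        return (in + pad_before + pad_after - effective_kernel) / stride + 1;
    }

    int main()
    {
        // Consistent with the "-DCONVOLVED_WIDTH=34" doxygen example, e.g. a
        // 100-wide input, 3x3 kernel, stride 3, pad 2+2: (100 + 4 - 3) / 3 + 1 == 34.
        assert(convolved_size(100, 3, 3, 2, 2, 1) == 34);
        return 0;
    }
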
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 9df91fccde..58ecd9ccb3 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -87,8 +87,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *
(biases != nullptr) ? biases->info() : nullptr,
output->info(), num_groups));
- const DataType data_type = input->info()->data_type();
- const DataLayout data_layout = input->info()->data_layout();
+ const DataType data_type = input->info()->data_type();
_biases = biases;
_output = output;
@@ -101,8 +100,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *
build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
// Create kernel
- std::string kernel_name = std::string("reshape_to_columns_") + lower_string(string_from_data_layout(data_layout));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts.options()));
// Set static arguments
unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 61010711a6..8cb4f4b889 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -49,29 +49,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
- TensorShape expected_output_shape;
- if(is_flatten) /* Called by FlattenLayer */
+ if(output->total_size() > 0)
{
- expected_output_shape = misc::shape_calculator::compute_im2col_flatten_shape(input);
- }
- else if(!is_fully_connected) /* Called by ConvolutionLayer */
- {
- expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false);
- }
- else /* Called by FullyConnectedLayer */
- {
- const int num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
- const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
+ TensorShape expected_output_shape;
- expected_output_shape = misc::shape_calculator::compute_im2col_fc_shape(input, num_input_dimensions);
- }
+ if(is_flatten || is_fully_connected)
+ {
+ expected_output_shape = misc::shape_calculator::compute_flatten_shape(input);
+ }
+ else
+ {
+ expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false);
+ }
- TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+ TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
return Status{};
}
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index f5809a218a..b372c35dd9 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -23,8 +23,7 @@
*/
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
@@ -32,8 +31,13 @@ using namespace arm_compute;
void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLIm2ColKernel>();
- k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+ auto k = arm_compute::support::cpp14::make_unique<CLFlattenLayerKernel>();
+ k->configure(input, output);
_kernel = std::move(k);
CLScheduler::get().tune_kernel_static(*_kernel);
}
+
+Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLFlattenLayerKernel::validate(input, output);
+}
\ No newline at end of file
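
A hypothetical usage sketch of the refactored function; the tensor shapes are invented and an available OpenCL device is assumed:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(7U, 7U, 512U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(7U * 7U * 512U), 1, DataType::F32));

        // The new static validate() can be queried before configure()
        if(CLFlattenLayer::validate(src.info(), dst.info()).error_code() == ErrorCode::OK)
        {
            CLFlattenLayer flatten;
            flatten.configure(&src, &dst);

            src.allocator()->allocate();
            dst.allocator()->allocate();

            flatten.run();
            CLScheduler::get().sync();
        }
        return 0;
    }
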
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 6fd78a3fc9..60c28a0874 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -73,12 +73,11 @@ Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c
}
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _im2col_kernel(), _convert_weights(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
- _accumulate_biases_kernel(), _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
+ : _memory_group(memory_manager), _convert_weights(), _flatten_layer(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+ _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true),
_is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _is_prepared(false), _original_weights(nullptr)
{
}
-
void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
{
if(_is_quantized)
@@ -111,20 +110,19 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
- // Initialize output tensor for im2col
- TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
- _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+ // Initialize output tensor for flatten
+ TensorShape shape_flatten = compute_flatten_shape(input->info());
+ _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW));
- // Configure im2col kernel
- _memory_group.manage(&_im2col_output);
- _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
- CLScheduler::get().tune_kernel_static(_im2col_kernel);
+ // Configure flatten kernel
+ _memory_group.manage(&_flatten_output);
+ _flatten_layer.configure(input, &_flatten_output);
// Configure matrix multiply kernel
- configure_mm(&_im2col_output, weights, output);
+ configure_mm(&_flatten_output, weights, output);
- // Allocate the output tensor for im2col once all the configure methods have been called
- _im2col_output.allocator()->allocate();
+ // Allocate the output tensor for flatten once all the configure methods have been called
+ _flatten_output.allocator()->allocate();
}
void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
@@ -254,7 +252,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const GPUTarget gpu_target = CLScheduler::get().target();
- const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)).set_data_layout(DataLayout::NCHW));
+ const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -311,9 +309,9 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
// Fully Connected layer after a Convolution Layer without batches
ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
- // Validate im2col kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false));
- input_to_use = &im2col_input;
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input));
+ input_to_use = &flatten_input;
}
else
{
@@ -341,7 +339,7 @@ void CLFullyConnectedLayer::run()
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
- CLScheduler::get().enqueue(_im2col_kernel, false);
+ _flatten_layer.run();
}
// Run matrix multiply
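Note: the flatten path above replaces a 1x1 im2col with a dedicated flatten kernel; the W, H and C dimensions of the input collapse into a single dimension while batches are preserved. A minimal standalone sketch of that shape computation, assuming an NCHW-style { W, H, C, N } ordering (illustrative names, not the library's compute_flatten_shape):

    #include <array>
    #include <cstddef>
    #include <iostream>

    // shape = { W, H, C, N }; flatten collapses W*H*C and keeps the batch dimension
    std::array<std::size_t, 4> flatten_shape(const std::array<std::size_t, 4> &in)
    {
        return { in[0] * in[1] * in[2], in[3], 1, 1 };
    }

    int main()
    {
        const std::array<std::size_t, 4> in{ 7, 7, 64, 4 }; // 7x7x64 feature map, batch 4
        const auto out = flatten_shape(in);
        std::cout << out[0] << " x " << out[1] << '\n';     // prints "3136 x 4"
    }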
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 1d1b17bbf1..a8d7058f2a 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -171,6 +171,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
// Check if we need to reshape the matrix B only on the first run
const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
@@ -180,7 +181,7 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
- TensorInfo tmp_output_info = *output->clone();
+ TensorInfo tmp_output_info{};
// Get the GPU target
const GPUTarget gpu_target = CLScheduler::get().target();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index fb90415e31..49549a0ad0 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -171,7 +171,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
const DataLayout data_layout = input->info()->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
const unsigned int kernel_width = weights->info()->dimension(idx_width);
@@ -193,7 +192,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
ICLTensor *gemm_output_to_use = output;
ICLTensor *gemm_output_staged_to_use = output;
- const unsigned bias_element = (_append_bias && !_skip_im2col) ? 1 : 0;
const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
// Get parameters from conv_info
@@ -212,7 +210,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
dilation);
unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->info()->dimension(idx_width) * weights->info()->dimension(idx_height) * weights->info()->dimension(idx_channel) + bias_element;
// _weights_reshaped will be auto configured in the kernel.
// Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
@@ -223,25 +220,13 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
// Create tensor to store im2col reshaped inputs
if(!_skip_im2col)
{
- // Calculate im2col shape
- // For OpenCL the batch size is on the third dimension
- // TODO (giaiod01): Use auto-init COMPMID-1277
- TensorShape shape_im2col = input->info()->tensor_shape();
- if(shape_im2col.num_dimensions() >= 3)
- {
- shape_im2col.remove_dimension(2);
- }
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
-
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo im2col_reshaped_info(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(input->info()->quantization_info());
- _im2col_output.allocator()->init(im2col_reshaped_info);
_memory_group.manage(&_im2col_output);
- // Configure and tune im2col
+ // Configure and tune im2col. im2col output shape is auto-initialized
_im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation);
+
+ // Set quantization info
+ _im2col_output.info()->set_quantization_info(input->info()->quantization_info());
CLScheduler::get().tune_kernel_static(_im2col_kernel);
// Update GEMM input
@@ -350,11 +335,10 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
const ITensorInfo *gemm_output_staged_to_use = output;
const ITensorInfo *weights_to_use = weights;
- const bool is_nhwc = data_layout == DataLayout::NHWC;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized;
- const bool append_bias = (biases != nullptr) && (!is_quantized);
- const unsigned bias_element = (append_bias && !skip_im2col) ? 1 : 0;
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized;
+ const bool append_bias = (biases != nullptr) && (!is_quantized);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -391,7 +375,6 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
dilation);
unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + bias_element;
// Output tensor auto initialization if not yet initialized
ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr));
@@ -400,19 +383,14 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
if(!skip_im2col)
{
- // Create tensor info for im2col reshaped inputs
- // For OpenCL the batch size is on the third dimension
- // TODO (giaiod01): Use auto-init COMPMID-1277
- TensorShape shape_im2col = input->tensor_shape();
- if(input->tensor_shape().num_dimensions() >= 3)
- {
- shape_im2col.remove_dimension(2);
- }
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
- im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(input->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+ const Size2D kernel_dims(kernel_width, kernel_height);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true);
+
+ auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation));
gemm_input_to_use = &im2col_reshaped_info;
}
else if(append_bias)
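Note: the validate path above now relies on auto_init_if_empty instead of building the im2col TensorInfo by hand. The idea: an empty (uninitialized) tensor info is filled with the expected output shape, while a caller-provided one is left alone so the kernel's validate can flag mismatches. A hedged sketch of the pattern, using a stand-in struct rather than arm_compute::TensorInfo:

    #include <cassert>
    #include <vector>

    struct InfoSketch
    {
        std::vector<unsigned int> shape; // empty == not yet initialized
    };

    // Fill in the expected shape only if the caller left the info empty
    void auto_init_if_empty_sketch(InfoSketch &info, const std::vector<unsigned int> &expected)
    {
        if(info.shape.empty())
        {
            info.shape = expected;
        }
    }

    int main()
    {
        InfoSketch im2col_info{};                       // uninitialized
        auto_init_if_empty_sketch(im2col_info, { 145U, 196U });
        assert(im2col_info.shape.size() == 2);          // now initialized

        InfoSketch user_info{ { 64U, 64U } };           // caller-provided
        auto_init_if_empty_sketch(user_info, { 145U, 196U });
        assert(user_info.shape[0] == 64U);              // left untouched
        return 0;
    }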
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 25b8adc431..c2f0283d4e 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -113,7 +113,7 @@ void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITenso
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
- TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
+ TensorShape shape_im2col = compute_flatten_shape(input->info());
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
// Configure im2col kernel
@@ -249,7 +249,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
bool is_fc_after_conv = true;
bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
- const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
+ const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -420,4 +420,4 @@ void NEFullyConnectedLayer::prepare()
_is_prepared = true;
}
-}
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index c0a5d0a436..df4a040bad 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -223,7 +223,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
{
// Calculate im2col shape
// For NEON the batch size is on the fourth dimension
- // TODO (giaiod01): Use auto-init COMPMID-1277
+ // TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482
TensorShape shape_im2col = input->info()->tensor_shape();
shape_im2col.set(0, mat_weights_rows);
shape_im2col.set(1, conv_w * conv_h);
@@ -232,7 +232,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_memory_group.manage(&_im2col_output);
- // Configure and tune im2col
+ // Configure im2col
_im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation);
// Update GEMM input
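Note: in the im2col matrix set up above, dimension 0 holds one kernel-sized patch (kernel_w * kernel_h * channels elements, plus one when a bias is appended) and dimension 1 holds one entry per convolved output position (conv_w * conv_h). Plain arithmetic sketch with illustrative values, not taken from the patch:

    #include <iostream>

    int main()
    {
        const unsigned int kernel_w = 3, kernel_h = 3, channels = 16;
        const unsigned int conv_w = 14, conv_h = 14; // convolved output size
        const bool append_bias = true;

        const unsigned int patch_len = kernel_w * kernel_h * channels + (append_bias ? 1U : 0U);
        const unsigned int positions = conv_w * conv_h;

        // im2col output per batch: patch_len x positions
        std::cout << patch_len << " x " << positions << '\n'; // prints "145 x 196"
    }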
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 937bd08c6f..4a6d778f9f 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -160,14 +160,14 @@ public:
: ShapeDataset("Shape",
{
// Batch size 1
- TensorShape{ 9U, 9U },
+ TensorShape{ 11U, 11U },
TensorShape{ 27U, 13U, 2U },
TensorShape{ 128U, 64U, 1U, 3U },
// Batch size 4
- TensorShape{ 9U, 9U, 3U, 4U },
+ TensorShape{ 11U, 11U, 3U, 4U },
TensorShape{ 27U, 13U, 2U, 4U },
// Arbitrary batch size
- TensorShape{ 9U, 9U, 3U, 5U }
+ TensorShape{ 11U, 11U, 3U, 5U }
})
{
}
diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp
index 9422fcc49b..291befa6cb 100644
--- a/tests/validation/CL/Im2Col.cpp
+++ b/tests/validation/CL/Im2Col.cpp
@@ -41,8 +41,18 @@ namespace validation
{
namespace
{
-const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(3U, 1U), Size2D(1U, 5U), Size2D(5U, 5U), Size2D(7U, 7U) });
-const auto padstrides = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) });
+// *INDENT-OFF*
+// clang-format off
+const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U),
+ Size2D(5U, 5U),
+ Size2D(3U, 1U),
+ Size2D(1U, 3U),
+ Size2D(5U, 3U),
+ Size2D(1U, 1U),
+ Size2D(11U, 11U)} );
+const auto padstrides = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U),
+ PadStrideInfo(1U, 1U, 1U, 1U),
+ PadStrideInfo(2U, 2U, 0U, 2U) });
const auto conv_args = combine(combine(combine(conv_filter_sizes, padstrides),
framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
@@ -53,23 +63,19 @@ TEST_SUITE(Im2Col)
using CLIm2Col = CLSynthetizeFunction<CLIm2ColKernel>;
-// *INDENT-OFF*
-// clang-format off
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
framework::dataset::make("InputInfo", { TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::U8), // Unsupported data type
TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::F32), // Mismatching data type
TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QASYMM8), // Bias not supported with QASYMM8
- TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QASYMM8), // Mismatching shapes
TensorInfo(TensorShape(10U, 12U, 2U, 2U), 1, DataType::QASYMM8),
}),
framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::F16),
TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::F16),
TensorInfo(TensorShape(3U, 3U, 10U, 2U), 1, DataType::QASYMM8),
- TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::QASYMM8),
- TensorInfo(TensorShape(18U, 80U, 1U, 2U), 1, DataType::QASYMM8),
+ TensorInfo(TensorShape(18U, 80U, 2U, 1U), 1, DataType::QASYMM8),
})),
- framework::dataset::make("HasBias", { true, true, true, false, false })),
- framework::dataset::make("Expected", { false, false, false, true, true })),
+ framework::dataset::make("HasBias", { true, true, true, false })),
+ framework::dataset::make("Expected", { false, false, false, true })),
input_info, output_info, has_bias, expected)
{
@@ -83,16 +89,18 @@ template <typename T>
using CLIm2ColFixture = Im2ColValidationFixture<CLTensor, CLAccessor, CLIm2Col, T, true>;
TEST_SUITE(Float)
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
TEST_SUITE_END()
-FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
{
// Validate output
validate(CLAccessor(_target), _reference);
@@ -101,14 +109,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode:
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
{
// Validate output
validate(CLAccessor(_target), _reference);
@@ -120,14 +130,16 @@ TEST_SUITE_END()
TEST_SUITE_END()
TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", true)))
{
// Validate output
validate(CLAccessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", false)))
{
// Validate output
validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/LocallyConnected.cpp b/tests/validation/CL/LocallyConnected.cpp
index 5381072131..dbfe4e269f 100644
--- a/tests/validation/CL/LocallyConnected.cpp
+++ b/tests/validation/CL/LocallyConnected.cpp
@@ -59,6 +59,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/bias
TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/output
TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Asymmetric padding
+ TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Padding required
TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32)
}),
framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F16),
@@ -68,7 +69,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
- TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32)
+ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32),
+ TensorInfo(TensorShape(1U, 3U, 5U, 21U, 575U), 1, DataType::F32)
})),
framework::dataset::make("BiasInfo", { TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
TensorInfo(TensorShape(21U, 275U), 1, DataType::F16),
@@ -77,7 +79,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
TensorInfo(TensorShape(21U, 274U), 1, DataType::F32),
TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
- TensorInfo(TensorShape(21U, 275U), 1, DataType::F32)
+ TensorInfo(TensorShape(21U, 275U), 1, DataType::F32),
+ TensorInfo(TensorShape(21U, 575U), 1, DataType::F32)
})),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
@@ -86,7 +89,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
TensorInfo(TensorShape(11U, 25U, 22U), 1, DataType::F32),
TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
- TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32)
+ TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32),
+ TensorInfo(TensorShape(23U, 25U, 21U), 1, DataType::F32)
})),
framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0),
PadStrideInfo(2, 1, 0, 0),
@@ -94,10 +98,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
PadStrideInfo(2, 1, 0, 0),
PadStrideInfo(2, 1, 0, 0),
PadStrideInfo(2, 1, 0, 0),
- PadStrideInfo(2, 1, 1, 0, 0, 0, DimensionRoundingType::FLOOR),
- PadStrideInfo(2, 1, 0, 0)
+ PadStrideInfo(2, 1, 1, 0),
+ PadStrideInfo(2, 1, 0, 0),
+ PadStrideInfo(1, 1, 0, 0)
})),
- framework::dataset::make("Expected", { false, false, false, false, false, false, false, true })),
+ framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true })),
input_info, weights_info, bias_info, output_info, conv_info, expected)
{
bool is_valid = bool(CLLocallyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false),
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index bff8634f8f..f011ebe935 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -77,14 +77,16 @@ using NEIm2ColFixture = Im2ColValidationFixture<Tensor, Accessor, NEIm2Col, T, f
TEST_SUITE(Float)
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", false)))
{
// Validate output
validate(Accessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", false)))
{
// Validate output
validate(Accessor(_target), _reference);
@@ -94,14 +96,16 @@ TEST_SUITE_END()
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", false)))
{
// Validate output
validate(Accessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", false)))
{
// Validate output
validate(Accessor(_target), _reference);
@@ -113,14 +117,16 @@ TEST_SUITE_END()
TEST_SUITE_END()
TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", false)))
{
// Validate output
validate(Accessor(_target), _reference);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
- conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)),
+ conv_args),
+ framework::dataset::make("ChannelsFirstOutputNHWC", false)))
{
// Validate output
validate(Accessor(_target), _reference);
diff --git a/tests/validation/fixtures/FlattenLayerFixture.h b/tests/validation/fixtures/FlattenLayerFixture.h
index f273e9315c..d17080695b 100644
--- a/tests/validation/fixtures/FlattenLayerFixture.h
+++ b/tests/validation/fixtures/FlattenLayerFixture.h
@@ -55,7 +55,7 @@ public:
{
TensorShape shape_flatten;
TensorInfo input_info(shape, 1, data_type);
- shape_flatten = compute_im2col_flatten_shape(&input_info);
+ shape_flatten = compute_flatten_shape(&input_info);
_target = compute_target(shape, shape_flatten, data_type);
_reference = compute_reference(shape, shape_flatten, data_type);
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
index f72e38fefc..da2576b37c 100644
--- a/tests/validation/fixtures/Im2ColFixture.h
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -49,7 +49,8 @@ class Im2ColValidationFixture : public framework::Fixture
{
public:
template <typename...>
- void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout)
+ void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout,
+ bool channels_first_output_nhwc)
{
_kernel_dims = kernel_dims;
_conv_info = conv_info;
@@ -68,7 +69,7 @@ public:
const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z);
_target = compute_target(input_shape, output_shape, data_type);
- compute_reference(input_shape, output_shape, data_type);
+ compute_reference(input_shape, output_shape, data_type, channels_first_output_nhwc);
}
protected:
@@ -107,14 +108,16 @@ protected:
return dst;
}
- void compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+ void compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type, bool channels_first_output_nhwc)
{
// Create reference
SimpleTensor<T> src{ input_shape, data_type, 1, _quant_info, _data_layout };
_reference = SimpleTensor<T>(output_shape, data_type, 1, _quant_info, DataLayout::NCHW);
+
// Fill reference
fill(src);
- reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias);
+
+ reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias, channels_first_output_nhwc);
}
TensorType _target{};
SimpleTensor<T> _reference{};
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
index 83ef8b40a5..2459499474 100644
--- a/tests/validation/reference/Im2Col.cpp
+++ b/tests/validation/reference/Im2Col.cpp
@@ -23,8 +23,6 @@
*/
#include "Im2Col.h"
-#include "Permute.h"
-
#include "arm_compute/core/Types.h"
#include "tests/validation/Helpers.h"
#include "tests/validation/reference/Utils.h"
@@ -41,46 +39,45 @@ template <typename T>
void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
- // Create reference
- const int pad_x = conv_info.pad().first;
- const int pad_y = conv_info.pad().second;
const int stride_x = conv_info.stride().first;
const int stride_y = conv_info.stride().second;
const int kernel_width = kernel_dims.width;
const int kernel_height = kernel_dims.height;
+ const int pad_x = conv_info.pad().first;
+ const int pad_y = conv_info.pad().second;
const int src_width = src.shape().x();
const int src_height = src.shape().y();
- const int src_depth = src.shape().z();
+ const int src_channels = src.shape().z();
const int batches = src.shape().total_size_upper(3);
+ const int dst_height = dst.shape().y();
const int pad_val = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+ int dst_idx = 0;
- int dst_idx = 0;
- // dst[dst_idx++] will write out of bounds if kernel_height == kernel_width == 1 because lasty will be the bottom padding row
- // and this is not present in the dst buffer
- const int lasty = src_height + (kernel_height > 1 ? pad_y : 0) - kernel_height;
- const int lastx = src_width + (kernel_width > 1 ? pad_x : 0) - kernel_width;
+ // Compute width and height of the convolved tensors
+ std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src_width, src_height, kernel_dims.width, kernel_dims.height, conv_info);
for(int b = 0; b < batches; ++b)
{
- for(int y = -pad_y; y <= lasty; y += stride_y)
+ for(int yo = 0; yo < dst_height; ++yo)
{
- for(int x = -pad_x; x <= lastx; x += stride_x)
+ // Compute input spatial coordinates
+ const int xi = (yo % convolved_dims.first) * stride_x;
+ const int yi = (yo / convolved_dims.first) * stride_y;
+
+ for(int ci = 0; ci < src_channels; ++ci)
{
- for(int z = 0; z < src_depth; ++z)
+ for(int yk = 0; yk < kernel_height; ++yk)
{
- for(int patch_y = y; patch_y < (y + kernel_height); ++patch_y)
+ for(int xk = 0; xk < kernel_width; ++xk)
{
- for(int patch_x = x; patch_x < (x + kernel_width); ++patch_x)
- {
- dst[dst_idx++] = tensor_elem_at(src, Coordinates(patch_x, patch_y, z, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
- }
+ dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
}
}
+ }
- if(has_bias)
- {
- dst[dst_idx++] = static_cast<T>(1);
- }
+ if(has_bias)
+ {
+ dst[dst_idx++] = static_cast<T>(1);
}
}
}
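Note: the rewritten NCHW reference above walks destination rows directly; each row index yo maps back to the top-left corner of one input patch via the convolved output width. A simplified standalone excerpt of that mapping (not the library code):

    #include <utility>

    // yo is the destination row, conv_w the convolved output width
    std::pair<int, int> patch_origin(int yo, int conv_w, int stride_x, int stride_y)
    {
        const int xi = (yo % conv_w) * stride_x; // x of the patch's top-left corner
        const int yi = (yo / conv_w) * stride_y; // y of the patch's top-left corner
        return { xi, yi };
    }
    // Each element is then read at (xi + xk - pad_x, yi + yk - pad_y, ci, b);
    // out-of-bounds reads return the padding value (the quantization offset
    // for QASYMM8, zero otherwise), exactly as in the loop above.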
@@ -133,7 +130,56 @@ void im2col_nhwc(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D
}
template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void im2col_nhwc_channel_first(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NHWC);
+ const int stride_x = conv_info.stride().first;
+ const int stride_y = conv_info.stride().second;
+ const int kernel_width = kernel_dims.width;
+ const int kernel_height = kernel_dims.height;
+ const int pad_x = conv_info.pad().first;
+ const int pad_y = conv_info.pad().second;
+ const int src_width = src.shape().y();
+ const int src_height = src.shape().z();
+ const int src_channels = src.shape().x();
+ const int batches = src.shape().total_size_upper(3);
+ const int dst_width = has_bias ? dst.shape().x() - 1 : dst.shape().x();
+ const int dst_height = dst.shape().y();
+ const int pad_val = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0;
+
+ // Compute width and height of the convolved tensors
+ std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src_width, src_height, kernel_dims.width, kernel_dims.height, conv_info);
+
+ for(int b = 0; b < batches; ++b)
+ {
+ for(int yo = 0; yo < dst_height; ++yo)
+ {
+ // Compute input spatial coordinates
+ const int xi = (yo % convolved_dims.first) * stride_x;
+ const int yi = (yo / convolved_dims.first) * stride_y;
+
+ for(int ci = 0; ci < src_channels; ++ci)
+ {
+ for(int yk = 0; yk < kernel_height; ++yk)
+ {
+ for(int xk = 0; xk < kernel_width; ++xk)
+ {
+ dst[ci + (xk + yk * kernel_width) * src_channels + yo * dst.shape().x() + b * dst.shape().x() * dst.shape().y()] = tensor_elem_at(src, Coordinates(ci, xi + xk - pad_x, yi + yk - pad_y, b),
+ BorderMode::CONSTANT, static_cast<T>(pad_val));
+ }
+ }
+ }
+
+ if(has_bias)
+ {
+ dst[dst_width + yo * dst.shape().x() + b * dst.shape().x() * dst.shape().y()] = static_cast<T>(1);
+ }
+ }
+ }
+}
+
+template <typename T>
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc)
{
switch(src.data_layout())
{
@@ -144,7 +190,14 @@ void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kern
}
case DataLayout::NHWC:
{
- im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
+ if(channels_first_output_nhwc)
+ {
+ im2col_nhwc_channel_first(src, dst, kernel_dims, conv_info, has_bias);
+ }
+ else
+ {
+ im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias);
+ }
break;
}
default:
@@ -155,9 +208,9 @@ void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kern
}
}
-template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
-template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
-template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
} // namespace reference
} // namespace validation
} // namespace test
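Note: the new channel-first NHWC reference linearizes each patch with the channel index varying fastest, matching the layout the optimized CL kernel now produces. A hedged sketch of that destination indexing (standalone arithmetic, illustrative names):

    // dst_w/dst_h are the destination tensor's x/y extents
    int dst_index(int ci, int xk, int yk, int yo, int b,
                  int channels, int kernel_w, int dst_w, int dst_h)
    {
        return ci + (xk + yk * kernel_w) * channels // channel varies fastest within a patch
               + yo * dst_w                         // one destination row per output position
               + b * dst_w * dst_h;                 // batch stride
    }
    // With has_bias, the final element of each row (index dst_w - 1) is set to 1,
    // matching the bias handling in im2col_nhwc_channel_first above.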
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
index 5277171a2f..b1ebaf25da 100644
--- a/tests/validation/reference/Im2Col.h
+++ b/tests/validation/reference/Im2Col.h
@@ -35,7 +35,7 @@ namespace validation
namespace reference
{
template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc = false);
} // namespace reference
} // namespace validation
} // namespace test