From 215b4ea6c9dee480a22070d5873b0b8cb52531a0 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 28 Jun 2018 16:29:29 +0100 Subject: COMPMID-1277 - Optimizing CLIm2ColKernel for NHWC. This patch includes: - Im2Col optimizations for NHWC using a new data layout - Refactoring of CLIm2ColKernel adding validation method and auto-init - Removed im2col_reduced from CLIm2ColKernel and created a new kernel CLFlattenLayerKernel Change-Id: I1620640b6796baa268324b33ae92cdd8de53e27c Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141241 Tested-by: Jenkins Reviewed-by: Giorgio Arena --- arm_compute/core/CL/CLKernels.h | 1 + arm_compute/core/CL/kernels/CLFlattenLayerKernel.h | 74 +++ arm_compute/core/CL/kernels/CLIm2ColKernel.h | 41 +- arm_compute/core/utils/misc/ShapeCalculator.h | 14 +- arm_compute/runtime/CL/functions/CLFlattenLayer.h | 17 +- .../runtime/CL/functions/CLFullyConnectedLayer.h | 6 +- src/core/CL/CLKernelLibrary.cpp | 21 +- src/core/CL/cl_kernels/convolution_layer.cl | 74 +-- src/core/CL/cl_kernels/flatten.cl | 57 +++ src/core/CL/cl_kernels/im2col.cl | 529 +++++++++++---------- src/core/CL/kernels/CLFlattenLayerKernel.cpp | 151 ++++++ src/core/CL/kernels/CLIm2ColKernel.cpp | 474 +++++++++--------- src/core/CL/kernels/CLWeightsReshapeKernel.cpp | 6 +- src/core/NEON/kernels/NEIm2ColKernel.cpp | 31 +- src/runtime/CL/functions/CLFlattenLayer.cpp | 12 +- src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 34 +- src/runtime/CL/functions/CLGEMM.cpp | 3 +- .../CL/functions/CLGEMMConvolutionLayer.cpp | 54 +-- .../NEON/functions/NEFullyConnectedLayer.cpp | 6 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 4 +- tests/datasets/ShapeDatasets.h | 6 +- tests/validation/CL/Im2Col.cpp | 54 ++- tests/validation/CL/LocallyConnected.cpp | 17 +- tests/validation/NEON/Im2Col.cpp | 30 +- tests/validation/fixtures/FlattenLayerFixture.h | 2 +- tests/validation/fixtures/Im2ColFixture.h | 11 +- tests/validation/reference/Im2Col.cpp | 109 +++-- tests/validation/reference/Im2Col.h | 2 +- 28 files changed, 1027 insertions(+), 813 deletions(-) create mode 100644 arm_compute/core/CL/kernels/CLFlattenLayerKernel.h create mode 100644 src/core/CL/cl_kernels/flatten.cl create mode 100644 src/core/CL/kernels/CLFlattenLayerKernel.cpp diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 737d8df4e8..ea942bde5d 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -62,6 +62,7 @@ #include "arm_compute/core/CL/kernels/CLErodeKernel.h" #include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" #include "arm_compute/core/CL/kernels/CLFloorKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" diff --git a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h new file mode 100644 index 0000000000..a5cf6e0c33 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLFLATTENLAYERKERNEL_H__ +#define __ARM_COMPUTE_CLFLATTENLAYERKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL interface for the flatten kernel.*/ +class CLFlattenLayerKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLFlattenLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFlattenLayerKernel(const CLFlattenLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFlattenLayerKernel &operator=(const CLFlattenLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + CLFlattenLayerKernel(CLFlattenLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + CLFlattenLayerKernel &operator=(CLFlattenLayerKernel &&) = default; + /** Set the input and output of the kernel. + * + * @param[in] input First input tensor to flatten with at least 3 dimensions. + * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor with shape [w*h*d, input_batches] where: + * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input + */ + void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayerKernel + * + * @param[in] input First input tensor to flatten with at least 3 dimensions. + * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor with shape [w*h*d, input_batches] where: + * w = width input tensor, h = height input tensor and d = depth input tensor. 
Data type supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +public: + const ICLTensor *_input; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLFLATTENLAYERKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h index fc930abcbe..ae19319047 100644 --- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h +++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h @@ -96,48 +96,13 @@ public: // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; -private: - /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and kernel's low 3 dimensions are same as input) - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel. - */ - void run_reduced(const Window &window, cl::CommandQueue &queue); - /** run the generic convolution layer input reshape kernel - * - * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). - * @param[in,out] queue Command queue on which to enqueue the kernel. - */ - void run_generic(const Window &window, cl::CommandQueue &queue); - - /** Chooses and configure the right kernel for the given input arguments. - * - * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/F16/F32 - * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, - * while every dimension above represents a batch. Data types supported: Same as @p input - * @param[in] kernel_dims The kernel dimensions (width and height). - * @param[in] dilation Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] has_bias In case biases are provided expands the matrix with 1. - * @param[out] build_opts OpenCL buil program options. 
- *
- * @return the name of the kernel chosen
- */
- std::string configure_window(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims,
- const Size2D &dilation, const PadStrideInfo &conv_info, CLBuildOptions &build_opts);
-
- /** Common signature for the kernel to run */
- using Im2ColFunction = void (CLIm2ColKernel::*)(const Window &, cl::CommandQueue &);
-
public:
 const ICLTensor *_input;
 ICLTensor *_output;
- PadStrideInfo _conv_info;
 std::pair<unsigned int, unsigned int> _convolved_dims;
- unsigned int _num_elems_processed_per_iteration;
- Im2ColFunction _run_func;
- Size2D _kernel_dims;
+ unsigned int _num_elems_processed_per_iteration;
+ Size2D _kernel_dims;
+ PadStrideInfo _conv_info;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 1e5b9afd0e..0a2a535502 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -215,19 +215,13 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz
 return output_shape;
 }
-inline TensorShape compute_im2col_fc_shape(const ITensorInfo *input, const int num_input_dimensions = 3)
+inline TensorShape compute_flatten_shape(const ITensorInfo *input)
 {
- TensorShape output_shape{ input->tensor_shape() };
-
- output_shape.collapse(num_input_dimensions);
+ // The output shape will be the flattened version of the input (i.e. [ width * height * channels, num_batches, ... ] ). Used for FlattenLayer and FullyConnectedLayer.
- return output_shape;
-}
-inline TensorShape compute_im2col_flatten_shape(const ITensorInfo *input)
-{
- // The output shape will be the flatten version of the input (i.e. [ width * height * channels, 1, 1, ... ] ). Used for FlattenLayer.
 TensorShape output_shape{ input->tensor_shape() };
- output_shape.collapse(3, 0);
+
+ output_shape.collapse(3);
 return output_shape;
 }
diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
index 88df4a7f96..ebc0e5e53f 100644
--- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
@@ -33,7 +33,7 @@ class ICLTensor;
 /** Basic function to execute flatten. This function calls the following OpenCL kernel:
 *
-* -# @ref CLIm2ColKernel
+* -# @ref CLFlattenLayerKernel
 *
 */
class CLFlattenLayer : public ICLSimpleFunction
@@ -41,11 +41,22 @@ class CLFlattenLayer : public ICLSimpleFunction
 public:
 /** Initialise the kernel's input and output.
 *
- * @param[in] input First input tensor to flatten with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F16/F32
+ * @param[in] input First input tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
 * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
- * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
+ * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
 */
 void configure(const ICLTensor *input, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayer
+ *
+ * @param[in] input First input tensor to flatten with at least 3 dimensions. 
+ * The dimensions above the third will be interpreted as batches. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor with shape [w*h*d, input_batches] where: + * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); }; } // namespace arm_compute diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index e8fe8e47a2..450cd831ee 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -27,11 +27,11 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" -#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h" +#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" @@ -130,14 +130,14 @@ private: void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); CLMemoryGroup _memory_group; - CLIm2ColKernel _im2col_kernel; CLConvertFullyConnectedWeights _convert_weights; + CLFlattenLayer _flatten_layer; CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel; CLGEMM _mm_gemm; CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; - CLTensor _im2col_output; + CLTensor _flatten_output; CLTensor _gemmlowp_output; CLTensor _converted_weights_output; CLTensor _reshape_weights_output; diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 64519ff459..29b01e6cea 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -226,6 +226,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" }, { "erode", "erode.cl" }, { "fast_corners", "fast_corners.cl" }, + { "flatten", "flatten.cl" }, { "fill_image_borders_constant", "fill_border.cl" }, { "fill_image_borders_replicate", "fill_border.cl" }, { "finalize", "optical_flow_pyramid_lk.cl" }, @@ -270,13 +271,12 @@ const std::map CLKernelLibrary::_kernel_program_map = { "hog_detector", "hog.cl" }, { "hog_orientation_binning", "hog.cl" }, { "hysteresis", "canny.cl" }, - { "im2col1x1_stridex1_dchw", "im2col.cl" }, - { "im2col3x3_dchw", "im2col.cl" }, - { "im2col5x5_dchw", "im2col.cl" }, - { "im2col11x11_padx0_pady0_dchw", "im2col.cl" }, - { "im2col_generic_dchw", "im2col.cl" }, - { "im2col_generic_padx0_pady0_dchw", "im2col.cl" }, - { "im2col_reduced_dchw", "im2col.cl" }, + { "im2col1x1_stridex1_nchw", "im2col.cl" }, + { "im2col3x3_nchw", "im2col.cl" }, + { "im2col5x5_nchw", "im2col.cl" }, + { "im2col11x11_padx0_pady0_nchw", "im2col.cl" }, + { "im2col_generic_nchw", "im2col.cl" }, + { "im2col_generic_padx0_pady0_nchw", "im2col.cl" }, { "im2col3x3_nhwc", "im2col.cl" }, { "im2col_generic_nhwc", 
"im2col.cl" }, { "init_level", "optical_flow_pyramid_lk.cl" }, @@ -333,8 +333,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "remap_nearest_neighbour", "remap.cl" }, { "remap_bilinear", "remap.cl" }, { "reshape_layer", "reshape_layer.cl" }, - { "reshape_to_columns_nchw", "convolution_layer.cl" }, - { "reshape_to_columns_nhwc", "convolution_layer.cl" }, + { "reshape_to_columns", "convolution_layer.cl" }, { "RGB888_to_IYUV_bt709", "color_convert.cl" }, { "RGB888_to_NV12_bt709", "color_convert.cl" }, { "RGB888_to_RGBA8888_bt709", "color_convert.cl" }, @@ -570,6 +569,10 @@ const std::map CLKernelLibrary::_program_source_map = { "fast_corners.cl", #include "./cl_kernels/fast_corners.clembed" + }, + { + "flatten.cl", +#include "./cl_kernels/flatten.clembed" }, { "fill_border.cl", diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl index 9335b047fe..2b75b45fe1 100644 --- a/src/core/CL/cl_kernels/convolution_layer.cl +++ b/src/core/CL/cl_kernels/convolution_layer.cl @@ -53,7 +53,7 @@ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) */ -__kernel void reshape_to_columns_nchw( +__kernel void reshape_to_columns( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), #ifdef HAS_BIAS @@ -109,74 +109,4 @@ __kernel void reshape_to_columns_nchw( } } } - -/** This kernel reshapes the tensor's low three dimensions to single column - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] bias_ptr Pointer to the bias tensor. Same as @p src_ptr - * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes) - * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] depth The depth of the input tensor - * @param[in] width The width of the input tensor - * @param[in] height The height of the input tensor - * @param[in] total_filters Total number of filters. 
4th dimension of the weights matrix - */ -__kernel void reshape_to_columns_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), -#ifdef HAS_BIAS - VECTOR_DECLARATION(bias), -#endif /* HAS_BIAS */ - uint depth, uint width, uint height, uint total_filters, uint dst_stride_z) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1)); - - __global uchar *tmp_src_ptr = src.ptr; - __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * width * dst_stride_y + get_global_id( - 0) * width * height * dst_stride_y; -#ifdef HAS_BIAS - __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes; -#endif /* HAS_BIAS */ - - if(is_last_thread) - { - for(uint i = 0; i < total_filters; ++i) - { - *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr); - -#ifdef HAS_BIAS - *((__global DATA_TYPE *)(tmp_dst_ptr + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr)); - tmp_bias_ptr += bias_stride_x; -#endif /* HAS_BIAS */ - tmp_src_ptr += height * src_stride_z; - tmp_dst_ptr += dst_stride_x; - } - } - else - { - for(uint i = 0; i < total_filters; ++i) - { - *((__global DATA_TYPE *)tmp_dst_ptr) = *((__global DATA_TYPE *)tmp_src_ptr); - tmp_src_ptr += height * src_stride_z; - tmp_dst_ptr += dst_stride_x; - } - } -} -#endif // defined(DATA_TYPE) && defined(NUM_GROUPS) \ No newline at end of file +#endif // defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/flatten.cl new file mode 100644 index 0000000000..df0f9c4886 --- /dev/null +++ b/src/core/CL/cl_kernels/flatten.cl @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) + +/** This opencl kernel flattens the first 3 dimensions of the input tensor + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void flatten(
+ TENSOR3D_DECLARATION(src),
+ VECTOR_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * (int)SRC_WIDTH + get_global_id(2) * (int)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(
+ DATA_TYPE);
+
+ *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index d034b30b68..274ec20046 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -35,13 +35,12 @@
 #error "Element size not support"
 #endif // ELEMENT_SIZE
 
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH)
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 1x1 and the stride_x = 1
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
 *
- * @note This kernel computes 4 elements
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
 * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
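 *
 * A scalar sketch of the mapping this kernel implements, assuming hypothetical host-side arrays src[channel][y][x] and dst[row][col] (the 1x1/stride_x=1 variant assumes zero padding):
 *
 *   // One dst row per output position yo = xc + yc * CONVOLVED_WIDTH;
 *   // column ch holds the single 1x1 "patch" element of input feature map ch.
 *   for(int yc = 0; yc < convolved_height; ++yc)
 *       for(int xc = 0; xc < CONVOLVED_WIDTH; ++xc)
 *           for(int ch = 0; ch < SRC_DEPTH; ++ch)
 *               dst[xc + yc * CONVOLVED_WIDTH][ch] = src[ch][yc * STRIDE_Y][xc];
 *   // With -DHAS_BIAS, column SRC_DEPTH of every row is additionally set to 1.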
*/ -__kernel void im2col1x1_stridex1_dchw( +__kernel void im2col1x1_stridex1_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor - const uint yc = get_global_id(1); // y coordinate in the convolved tensor - const uint ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const uint batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor + const uint yc = get_global_id(1); // y coordinate in the convolved tensor + const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map + const uint batch = get_global_id(2) / SRC_DEPTH; // batch size // Clamp xc // The strategy clamps at "xc" as it will be a valid value for sure @@ -107,7 +106,7 @@ __kernel void im2col1x1_stridex1_dchw( *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3; #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f; *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f; @@ -116,18 +115,16 @@ __kernel void im2col1x1_stridex1_dchw( } #endif // HAS_BIAS } -#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) +#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) -#define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR)) - -#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) - -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5 +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) +#if defined(DILATION_X) && defined(DILATION_Y) +/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. 
-DSTRIDE_X=1 and -DSTRIDE_Y=1 @@ -151,183 +148,65 @@ __kernel void im2col1x1_stridex1_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col_generic_nhwc( +__kernel void im2col_generic_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int src_stride_y_int = (int)src_stride_y; - const int src_stride_z_int = (int)src_stride_z; - const int xc = get_global_id(1); // x coordinate in the convolved tensor - const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor - const int ch = get_global_id(0); // input feature map - const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X - PAD_LEFT; const int yi = yc * STRIDE_Y - PAD_TOP; // Calculate output indices - const int xo = ch * KERNEL_HEIGHT * KERNEL_WIDTH; + const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; + // Linearize convolution elements for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) { - const int dilated_offset_y = yk * DILATION_Y; - const int y0 = yi + dilated_offset_y; - if(y0 >= 0 && y0 < SRC_HEIGHT) + int y = yi + yk * DILATION_Y; + for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr) { - int xk; - for(xk = 0; xk < KERNEL_WIDTH; xk++) + int x = xi + xk * DILATION_X; +#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); +#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) { - const int dilated_offset_x = xk * DILATION_X; - const int x0 = xi + dilated_offset_x; - if(x0 >= 0 && x0 < SRC_WIDTH) - { - *((__global DATA_TYPE *)output_ptr) = PTR_TO_VALUE(input_ptr + dilated_offset_x * src_stride_y + dilated_offset_y * src_stride_z, DATA_TYPE); - } - else - { - *((__global DATA_TYPE *)output_ptr) = PAD_VALUE; - } - output_ptr += 1 * sizeof(DATA_TYPE); + *output_ptr = PAD_VALUE; } - } - else - { - for(int xk = 0; xk < KERNEL_WIDTH; xk++) + else { - *((__global DATA_TYPE *)output_ptr) = (DATA_TYPE)PAD_VALUE; - output_ptr += 1 * dst_stride_x; + *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); } +#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 } } -#ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) - { - *((__global DATA_TYPE *)output_ptr) = 1.0f; - output_ptr 
+= 1 * dst_stride_x; - } -#endif // HAS_BIAS -} - -/** This kernel performs a reshaping of the input tensor (with layout NHWC) to a tensor used to perform convolution using GEMM when the kernel size is 3x3 - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
- */ -__kernel void im2col3x3_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - const int src_stride_y_int = (int)src_stride_y; - const int src_stride_z_int = (int)src_stride_z; - const int xc = get_global_id(1); // x coordinate in the convolved tensor - const int yc = get_global_id(2) % CONVOLVED_HEIGHT; // y coordinate in the convolved tensor - const int ch = get_global_id(0); // input feature map - const int batch = get_global_id(2) / CONVOLVED_HEIGHT; // batch size - - // Calculate input indices - const int xi = xc * STRIDE_X - PAD_LEFT; - const int yi = yc * STRIDE_Y - PAD_TOP; - - // Calculate output indices - const int xo = ch * 9; // 3x3 - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_y_int + yi * src_stride_z_int + ch * src_stride_x + batch * src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; - - VEC_DATA_TYPE(DATA_TYPE, 3) - row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); - VEC_DATA_TYPE(DATA_TYPE, 3) - row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); - VEC_DATA_TYPE(DATA_TYPE, 3) - row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))(PAD_VALUE); - - const int3 y = (int3)yi + (int3)(0, 1, 2); - // Guard against reading outside the input buffer, there is no padding in Z so we check if ry is inside the buffer. - if(y.s0 >= 0 && y.s0 < SRC_HEIGHT) - { - row0 = (VEC_DATA_TYPE(DATA_TYPE, 3))( - PTR_TO_VALUE(input_ptr + 0 * src_stride_y, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 1 * src_stride_y, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 2 * src_stride_y, DATA_TYPE)); - } - - if(y.s1 >= 0 && y.s1 < SRC_HEIGHT) - { - row1 = (VEC_DATA_TYPE(DATA_TYPE, 3))( - PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 1 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 1 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 1 * src_stride_z, DATA_TYPE)); - } - - if(y.s2 >= 0 && y.s2 < SRC_HEIGHT) - { - row2 = (VEC_DATA_TYPE(DATA_TYPE, 3))( - PTR_TO_VALUE(input_ptr + 0 * src_stride_y + 2 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 1 * src_stride_y + 2 * src_stride_z, DATA_TYPE), - PTR_TO_VALUE(input_ptr + 2 * src_stride_y + 2 * src_stride_z, DATA_TYPE)); - } - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - // Put 0 if the value is out-of-bound - const int3 x = (int3)xi + (int3)(0, 1, 2); - VEC_DATA_TYPE(COND_DATA_TYPE, 3) - cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); - row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0); - row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond0); - row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond0); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr); - *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { - *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; + *output_ptr = 1.0f; } #endif // HAS_BIAS } +#endif // defined(DILATION_X) && defined(DILATION_Y) -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 3x3 +/** This opencl kernel 
performs im2col when the kernel size is 3x3 and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 @@ -350,16 +229,16 @@ __kernel void im2col3x3_nhwc( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col3x3_dchw( +__kernel void im2col3x3_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X - PAD_LEFT; @@ -402,19 +281,19 @@ __kernel void im2col3x3_dchw( *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; } #endif // HAS_BIAS } -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 5x5 +/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. 
-DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 @@ -437,16 +316,16 @@ __kernel void im2col3x3_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col5x5_dchw( +__kernel void im2col5x5_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X - PAD_LEFT; @@ -576,20 +455,20 @@ __kernel void im2col5x5_dchw( } #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)output_ptr) = 1.0f; } #endif // HAS_BIAS } -#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) +#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM when the kernel size is 11x11 +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) +/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DKERNEL_DEPTH: e.g. -DKERNEL_DEPTH=3 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * @@ -610,16 +489,16 @@ __kernel void im2col5x5_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
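 *
 * A sketch of why -DHAS_BIAS appends a 1 to each row, assuming the GEMM-based convolution that consumes this output (K abbreviates SRC_DEPTH * 11 * 11 here): the reshaped weight matrix gains a final row holding the biases, so a single GEMM performs both the convolution and the bias addition:
 *
 *   // im2col row r:    [ x_0, x_1, ..., x_(K-1), 1 ]
 *   // weight column f: [ w_0f, w_1f, ..., w_(K-1)f, bias_f ]
 *   // out[r][f] = sum_k(x_k * w_kf) + 1 * bias_f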
*/ -__kernel void im2col11x11_padx0_pady0_dchw( +__kernel void im2col11x11_padx0_pady0_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X; @@ -776,21 +655,21 @@ __kernel void im2col11x11_padx0_pady0_dchw( } #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *((__global DATA_TYPE *)output_ptr) = 1.0f; } #endif // HAS_BIAS } -#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_DEPTH) +#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) -/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when - * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0. +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) +/** This opencl kernel performs im2col when the kernel size is greater than 1x1, we do not have paddings and the data layout is NCHW * * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4. * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3. + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 @@ -810,16 +689,16 @@ __kernel void im2col11x11_padx0_pady0_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
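 *
 * A scalar view of the vectorized row copy, as a sketch (it assumes WIDTH_MOD_VECTOR_SIZE == KERNEL_WIDTH % VECTOR_SIZE, with input_ptr and output_ptr pointing at the current kernel row):
 *
 *   // Bulk of the row: emitted as one vload/vstore of VECTOR_SIZE elements
 *   for(int xk = 0; xk < KERNEL_WIDTH - WIDTH_MOD_VECTOR_SIZE; xk += VECTOR_SIZE)
 *       for(int v = 0; v < VECTOR_SIZE; ++v)
 *           output_ptr[xk + v] = input_ptr[xk + v];
 *   // Remainder when KERNEL_WIDTH is not a multiple of VECTOR_SIZE
 *   for(int xk = KERNEL_WIDTH - WIDTH_MOD_VECTOR_SIZE; xk < KERNEL_WIDTH; ++xk)
 *       output_ptr[xk] = input_ptr[xk];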
*/ -__kernel void im2col_generic_padx0_pady0_dchw( +__kernel void im2col_generic_padx0_pady0_nchw( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size // Calculate input indices const int xi = xc * STRIDE_X; @@ -855,25 +734,25 @@ __kernel void im2col_generic_padx0_pady0_dchw( } /* End of loop over KERNEL_HEIGHT */ #ifdef HAS_BIAS - if(ch == (KERNEL_DEPTH - 1)) + if(ch == (SRC_DEPTH - 1)) { *output_ptr = 1.0f; } #endif // HAS_BIAS } -#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) +#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) + +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED) -#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) -/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. +#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + +/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC * + * @note This kernel computes VECTOR_SIZE elements * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DKERNEL_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DKERNEL_DEPTH=64 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. 
-DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 @@ -893,64 +772,154 @@ __kernel void im2col_generic_padx0_pady0_dchw( * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). */ -__kernel void im2col_generic_dchw( +__kernel void im2col3x3_nhwc( TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst), uint src_stride_w, uint dst_stride_w) { - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % KERNEL_DEPTH; // input feature map - const int batch = get_global_id(2) / KERNEL_DEPTH; // batch size + const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size // Calculate input indices - const int xi = xc * STRIDE_X - PAD_LEFT; - const int yi = yc * STRIDE_Y - PAD_TOP; + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - // Calculate output indices - const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; - __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; - - // Linearize convolution elements - for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) - { - int y = yi + yk * DILATION_Y; - for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr) - { - int x = xi + xk * DILATION_X; -#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); -#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) - { - *output_ptr = PAD_VALUE; - } - else - { - *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); - } -#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - } - } + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int3 
offset = 0;
+
+    // Clamp xi
+    int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+    xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+    xi_offset *= (int3)src_stride_y;
+
+    // Out-of-bound condition for X
+    int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+    // yi == 0
+    // Clamp yi
+    // yi_coord is cast to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    yi_coord = yi - (int)PAD_TOP;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with PAD_VALUE
+    int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+    values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+    values1 = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+    values2 = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // yi == 1
+    // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+    yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with PAD_VALUE
+    y_cond = (int)((uint)(yi - (int)PAD_TOP + 1) >= (uint)(SRC_HEIGHT));
+    values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+    values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+    values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // yi == 2
+    // Clamp yi_coord
+    yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with PAD_VALUE
+    y_cond = (int)((uint)(yi - (int)PAD_TOP + 2) >= (uint)(SRC_HEIGHT));
+    values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
+    values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
+    values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // Store
+    VSTORE(VECTOR_SIZE)
+    (values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH);
+    VSTORE(VECTOR_SIZE)
+    (values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH);
 #ifdef HAS_BIAS
-    if(ch == (KERNEL_DEPTH - 1))
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
     {
-        *output_ptr = 1.0f;
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
     }
 #endif // HAS_BIAS
 }
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
-/**This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
- * the kernel width and height are the same of width and height of the input tensor
+/** This OpenCL kernel performs a generic im2col implementation when the data layout is NHWC
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of the output tensor after the matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -958,35 +927,75 @@ __kernel void im2col_generic_dchw(
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
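The clamp-and-select scheme used by the NHWC kernels above is worth a small illustration: every coordinate is first clamped so that the vector load stays in-bounds, and the loaded lanes are then overwritten with PAD_VALUE wherever the original, unclamped coordinate was out of range. A minimal scalar C++ sketch of the unsigned-min clamp described in the kernel comments (helper name hypothetical, not library API):

    #include <algorithm>
    #include <cstdint>

    // Casting a possibly negative signed coordinate to unsigned turns -1 into
    // 4294967295, so a single min() clamps both below 0 and above size - 1.
    static inline int clamp_coord(int coord, int size)
    {
        return static_cast<int>(std::min(static_cast<uint32_t>(coord),
                                         static_cast<uint32_t>(size - 1)));
    }

    // clamp_coord(-1, 128) == 127: the load stays legal and the bogus value
    // is later replaced with PAD_VALUE by select().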
*/ -__kernel void im2col_reduced_dchw( +__kernel void im2col_generic_nhwc( TENSOR3D_DECLARATION(src), - VECTOR_DECLARATION(dst), - uint width, uint height) + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) { - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + const int ch = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size - const uint image_size = width * height; + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int i = 0; + for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) + { + // Clamp yi_coord + int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP; + yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1)); - __global uchar *tmp_out_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * width + get_global_id(2) * image_size) * dst_stride_x; + // Out-of-bound condition for Y + int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT); - *((__global DATA_TYPE *)tmp_out_ptr) = *((__global DATA_TYPE *)src.ptr); + for(int xk = 0; xk < KERNEL_WIDTH; ++xk) + { + // Clamp xi_coord + int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT); + xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1)); + + // Out-of-bound condition for X + int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z); + + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset)); + + // Replace with PAD_VALUE if the value is out-of-bound + values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition)); + + // Store + VSTORE(VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + + i++; + } + } #ifdef HAS_BIAS - // If it is the last thread in the 3 dimensional workgroup - if(get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1)) + if((ch + VECTOR_SIZE) >= SRC_DEPTH) { - tmp_out_ptr += dst_stride_x; - *((__global DATA_TYPE *)tmp_out_ptr) = (DATA_TYPE)1.0f; + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f; } #endif // HAS_BIAS } +#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED) #endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp new file mode 100644 index 0000000000..0b5feffcc9 --- /dev/null +++ 
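The addressing performed by im2col_generic_nhwc can be summarized with a scalar reference: for each output row (one convolution window), the kernel writes KERNEL_WIDTH * KERNEL_HEIGHT blocks of SRC_DEPTH channels, so column k * SRC_DEPTH + ch holds channel ch of kernel tap k, matching the "output_ptr + i * SRC_DEPTH" stores. A hedged C++ sketch under the same layout, assuming dilation 1 and symmetric padding (names illustrative):

    #include <cstddef>
    #include <vector>

    // Scalar NHWC im2col reference: row yo of dst holds the window at output
    // position yo, laid out tap-major / channel-minor.
    std::vector<float> im2col_nhwc_ref(const std::vector<float> &src,
                                       int W, int H, int C, int KW, int KH,
                                       int stride, int pad, float pad_value)
    {
        const int out_w = (W + 2 * pad - KW) / stride + 1;
        const int out_h = (H + 2 * pad - KH) / stride + 1;
        std::vector<float> dst(static_cast<std::size_t>(out_w) * out_h * KW * KH * C);
        for(int yo = 0; yo < out_w * out_h; ++yo)
        {
            const int xi = (yo % out_w) * stride - pad;
            const int yi = (yo / out_w) * stride - pad;
            for(int k = 0; k < KW * KH; ++k) // kernel tap, the kernel's "i"
            {
                const int  x     = xi + k % KW;
                const int  y     = yi + k / KW;
                const bool valid = (x >= 0 && x < W && y >= 0 && y < H);
                for(int c = 0; c < C; ++c)
                {
                    dst[(static_cast<std::size_t>(yo) * KW * KH + k) * C + c] =
                        valid ? src[(static_cast<std::size_t>(y) * W + x) * C + c] : pad_value;
                }
            }
        }
        return dst;
    }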
b/src/core/CL/kernels/CLFlattenLayerKernel.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLValidate.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::U16, DataType::S16, + DataType::U32, DataType::S32, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_flatten_shape(input)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_flatten_shape(input))); + + Window win = calculate_max_window(*input, Steps()); // Flatten does not need paddings + + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + return std::make_pair(Status{}, win); +} +} // namespace + +CLFlattenLayerKernel::CLFlattenLayerKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLFlattenLayerKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + + _input = input; + _output = output; + + 
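The validate/auto-init pattern in this new kernel is driven entirely by compute_flatten_shape(), and the shape arithmetic is small enough to restate. A sketch using illustrative types rather than the library's TensorShape:

    #include <array>
    #include <cstddef>

    // [w, h, d, batches] -> [w * h * d, batches], as in the kernel's
    // documented output shape.
    struct Shape4D
    {
        std::size_t w, h, d, n;
    };

    static std::array<std::size_t, 2> flatten_shape(const Shape4D &in)
    {
        return { in.w * in.h * in.d, in.n };
    }

    // Example: a 7x7x512 feature map with batch size 4 flattens to [25088, 4].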
CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("flatten", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "flatten"; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(input->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); +} + +Status CLFlattenLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); + return Status{}; +} + +void CLFlattenLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()->tensor_shape()); + + Window out_slice = out_window.first_slice_window_1D(); + Window in_slice = window.first_slice_window_3D(); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice, _lws_hint); + } + while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); +} +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index a09129bba6..39654e2190 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -31,7 +31,6 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" @@ -40,12 +39,22 @@ #include #include +#include using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation) +struct Im2ColConfiguration +{ + std::string kernel_name{}; + std::set build_options{}; + unsigned int num_elems_processed_per_iteration{}; + bool is_padding_required_nchw{}; +}; + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -54,263 +63,255 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, b ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - // Checks performed when output is configured - if(output->total_size() != 0) + if(output->total_size() > 0) { + const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } return Status{}; } -inline bool run_im2col_reduced(ITensorInfo *input, ITensorInfo *output, const PadStrideInfo &conv_info) +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw) { - int stride_x = 0; - int stride_y = 0; + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - std::tie(stride_x, stride_y) = conv_info.stride(); + // Output tensor auto initialization if not yet initialized + TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, true); - return (output->dimension(0) == (input->dimension(0) * input->dimension(1) * input->dimension(2))) && (TensorShape::num_max_dimensions >= 4) - && (std::equal(input->tensor_shape().cbegin() + 3, - input->tensor_shape().cend(), - output->tensor_shape().cbegin() + 1)) - && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding()); -} + auto_init_if_empty(*output, input->clone()->set_tensor_shape(expected_output_shape)); -} // namespace + const DataLayout data_layout = input->data_layout(); + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int input_width = input->dimension(width_idx); + const unsigned int input_height = input->dimension(height_idx); -CLIm2ColKernel::CLIm2ColKernel() - : _input(nullptr), _output(nullptr), _conv_info(), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr), _kernel_dims() -{ -} + // Configure the execute window based on the selected optimal OpenCL kernel + bool window_changed = false; + Window win; -std::string -CLIm2ColKernel::configure_window(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, - const Size2D &dilation, const PadStrideInfo &conv_info, CLBuildOptions &build_opts) -{ - std::string kernel_name; - bool is_optimized_path = false; - const bool reduced = run_im2col_reduced(input->info(), output->info(), conv_info); - const DataType data_type = input->info()->data_type(); - const bool squared_im2col = kernel_dims.width == kernel_dims.height; - const DataLayout data_layout = input->info()->data_layout(); + if(data_layout == DataLayout::NHWC) + { + win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const unsigned int 
channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const unsigned int input_width = input->info()->dimension(width_idx); - const unsigned int input_height = input->info()->dimension(height_idx); - const unsigned int input_channel = input->info()->dimension(channel_idx); + const int xin_start = 0; + const int xin_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration) : input->dimension(0); + const int yin_start = 0; + const int yin_end = input->dimension(1); - if(!reduced) + const int xout_start = 0; + const int xout_end = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration) : output->dimension(0); + const int yout_start = 0; + const int yout_end = output->dimension(1); + + AccessWindowStatic input_access(input, xin_start, yin_start, xin_end, yin_end); + AccessWindowStatic output_access(output, xout_start, yout_start, xout_end, yout_end); + window_changed = window_changed || update_window_and_padding(win, input_access, output_access); + } + else { - // Default kernel name - if(data_layout == DataLayout::NCHW) + if(is_padding_required_nchw) { - kernel_name = "im2col_generic_dchw"; + const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); + win = calculate_max_window(*input, + Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); + AccessWindowStatic input_access(input, + -border.left, + -border.top, + ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), + input_height + border.bottom); + window_changed = window_changed || update_window_and_padding(win, input_access); } else { - kernel_name = "im2col_generic_nhwc"; + // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so + // update_window_and_padding() can be skipped + win = calculate_max_window(*input, Steps()); } + } + + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension + win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); - _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); - - build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width)); - build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height)); - build_opts.add_option("-DKERNEL_DEPTH=" + support::cpp11::to_string(input_channel)); - build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(_convolved_dims.first)); - build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second)); - build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first)); - build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second)); - build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); - build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); - build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right())); - build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom())); - 
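The static access windows above lean on ceil_to_multiple() to round the accessible region up to the vector width; its semantics in a one-liner (a sketch; the library ships an equivalent helper):

    // Round x up to the next multiple of m (m > 0).
    static inline int ceil_to_multiple(int x, int m)
    {
        return ((x + m - 1) / m) * m;
    }

    // ceil_to_multiple(13, 4) == 16: a 13-element row is declared readable up
    // to 16 elements so that a vector load on the tail stays legal.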
build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); - build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); - build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); - build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0"); + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) +{ + const DataLayout data_layout = input->data_layout(); + const DataType data_type = input->data_type(); + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const unsigned int input_width = input->dimension(width_idx); + const unsigned int input_height = input->dimension(height_idx); + const unsigned int input_channel = input->dimension(channel_idx); + + const std::pair convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + + // Im2Col configuration + std::string kernel_name = "im2col_generic_"; + CLBuildOptions build_opts; + unsigned int num_elems_processed_per_iteration = 1; + bool is_padding_required_nchw = false; + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->element_size())); + build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width)); + build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height)); + build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first)); + build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second)); + build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first)); + build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second)); + build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); + build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); + build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right())); + build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom())); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); + build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel)); + build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); + build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); + build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->quantization_info().offset), "-DPAD_VALUE=0"); + build_opts.add_option_if(has_bias, "-DHAS_BIAS"); + 
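One subtlety in the options above: for quantized inputs PAD_VALUE is set to the quantization offset rather than 0, because in asymmetric quantization the offset is the encoding of real 0. A sketch of the reasoning, assuming the usual formula real = scale * (q - offset):

    #include <cstdint>

    // The quantized encoding of real value 0.0 is q = 0 / scale + offset.
    static inline uint8_t quantized_zero(int offset)
    {
        return static_cast<uint8_t>(offset);
    }

    // With offset = 128, out-of-bound pixels are padded with 128, which
    // dequantizes back to 0.0f and so matches the float kernels' PAD_VALUE=0.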
+ if(data_layout == DataLayout::NHWC) + { + num_elems_processed_per_iteration = 2; + is_padding_required_nchw = false; + + // Only the 3x3 case is optimized for NHWC + if(kernel_dims == Size2D(3U, 3U)) + { + kernel_name = "im2col3x3_"; + } + + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast(input_channel - num_elems_processed_per_iteration), 0))); + } + else + { if(dilation == Size2D(1U, 1U)) { + const bool squared_im2col = kernel_dims.width == kernel_dims.height; if(squared_im2col) { - // Check if we can run an optimized im2col + // Check if we can run an optimized im2col for NCHW switch(kernel_dims.width) { case 1: // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false - if(conv_info.stride().first == 1 && !conv_info.has_padding() && data_layout == DataLayout::NCHW) + if(conv_info.stride().first == 1 && !conv_info.has_padding()) { - // Set hint for LWS - _lws_hint = cl::NDRange(1, 1, 8); - _num_elems_processed_per_iteration = 4; - is_optimized_path = true; - kernel_name = "im2col1x1_stridex1_dchw"; + kernel_name = "im2col1x1_stridex1_"; + num_elems_processed_per_iteration = 4; + is_padding_required_nchw = true; } break; case 3: - _lws_hint = cl::NDRange(1, 1, 8); - _num_elems_processed_per_iteration = 1; - is_optimized_path = true; - switch(data_layout) - { - case DataLayout::NCHW: - kernel_name = "im2col3x3_dchw"; - break; - case DataLayout::NHWC: - kernel_name = "im2col3x3_nhwc"; - break; - default: - ARM_COMPUTE_ERROR("Not supported."); - break; - } + kernel_name = "im2col3x3_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = true; break; case 5: - _num_elems_processed_per_iteration = 1; - switch(data_layout) - { - case DataLayout::NCHW: - is_optimized_path = true; - kernel_name = "im2col5x5_dchw"; - break; - default: - // using generic_nhwc - is_optimized_path = false; - break; - } + kernel_name = "im2col5x5_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = true; break; case 11: - _num_elems_processed_per_iteration = 1; // Optimized im2col11x11 if pad_x = pad_y = 0 - if(!conv_info.has_padding() && data_layout == DataLayout::NCHW) + if(!conv_info.has_padding()) { - is_optimized_path = true; - kernel_name = "im2col11x11_padx0_pady0_dchw"; + kernel_name = "im2col11x11_padx0_pady0_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = true; } break; default: - is_optimized_path = false; + kernel_name = "im2col_generic_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = false; break; } } else if(kernel_dims.width > 1 && !conv_info.has_padding()) { - _num_elems_processed_per_iteration = 1; - is_optimized_path = false; - - if(data_layout == DataLayout::NCHW) - { - kernel_name = "im2col_generic_padx0_pady0_dchw"; - - // Optimized im2col is performed using one or more vector operations with the specified vector size - // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4 - // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3. - // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3. - // Using the vector size of 8, however, may be faster. - size_t vector_size = 4; - // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0 - // is used instead.) 
- if(kernel_dims.width < vector_size) - { - vector_size = kernel_dims.width; - } - // Vector size optimized for the 11x11 AlexNet convolution on Bifrost. - const GPUTarget gpu_target = get_target(); - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && kernel_dims.width == 11) - { - vector_size = 8; - } - const size_t width_mod_vector_size = kernel_dims.width % vector_size; - build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); - build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size)); - } + kernel_name = "im2col_generic_padx0_pady0_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = false; + + // Optimized im2col is performed using one or more vector operations with the specified vector size + // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4 + // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3. + // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3. + // Using the vector size of 8, however, may be faster. + // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0 + // is used instead.) + const size_t vector_size = std::min(static_cast(4), kernel_dims.width); + const size_t width_mod_vector_size = kernel_dims.width % vector_size; + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size)); } } - _run_func = &CLIm2ColKernel::run_generic; - } - else - { - _num_elems_processed_per_iteration = 1; - kernel_name = "im2col_reduced_dchw"; - _run_func = &CLIm2ColKernel::run_reduced; - } - // Configure kernel window - Window win; - if(is_optimized_path) - { - if(data_layout == DataLayout::NHWC) - { - win = calculate_max_window(*input->info(), - Steps(_num_elems_processed_per_iteration), - false, - BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left())); - const int x = -conv_info.pad_left(); - const int y = -conv_info.pad_top(); - const int h = kernel_dims.width * _num_elems_processed_per_iteration; - const int w = 1; - AccessWindowRectangle input_access(input->info(), x, y, w, h); - update_window_and_padding(win, input_access); - } - else - { - const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - win = calculate_max_window(*input->info(), - Steps(_num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); - AccessWindowStatic input_access(input->info(), - -border.left, - -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width * _num_elems_processed_per_iteration), - input_height + border.bottom); - update_window_and_padding(win, input_access); - } - } - else - { - // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so - // update_window_and_padding() can be skipped - win = calculate_max_window(*input->info(), Steps()); } - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - if(!reduced) - { - // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension - win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - 
win[Window::DimZ].start()); - } - ICLKernel::configure(win); - return kernel_name; + // Append the data layout to the kernel_name + kernel_name += lower_string(string_from_data_layout(data_layout)); + + Im2ColConfiguration im2col_config; + im2col_config.kernel_name = kernel_name; + im2col_config.build_options = build_opts.options(); + im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration; + im2col_config.is_padding_required_nchw = is_padding_required_nchw; + + return im2col_config; +} +} // namespace + +CLIm2ColKernel::CLIm2ColKernel() + : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info() +{ } void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation)); - _input = input; - _output = output; - _kernel_dims = kernel_dims; - _conv_info = conv_info; + const DataLayout data_layout = input->info()->data_layout(); + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int input_width = input->info()->dimension(width_idx); + const unsigned int input_height = input->info()->dimension(height_idx); - const DataType data_type = input->info()->data_type(); + // Select and configure the optimal OpenCL kernel to run. 
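The base names chosen above are completed with the data layout suffix, so one table of base names covers both layouts. A sketch of the composition, mirroring kernel_name += lower_string(string_from_data_layout(data_layout)) (helper hypothetical):

    #include <string>

    static std::string im2col_kernel_name(const std::string &base, bool is_nhwc)
    {
        return base + (is_nhwc ? "nhwc" : "nchw");
    }

    // im2col_kernel_name("im2col3x3_", true)       -> "im2col3x3_nhwc"
    // im2col_kernel_name("im2col_generic_", false) -> "im2col_generic_nchw"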
+ // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration + // and the padding requirement flag + Im2ColConfiguration im2col_config = configure_opencl_kernel(input->info(), kernel_dims, conv_info, has_bias, dilation); // Create kernel - CLBuildOptions build_opts; - build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); - build_opts.add_option_if(has_bias, "-DHAS_BIAS"); + _kernel = static_cast(CLKernelLibrary::get().create_kernel(im2col_config.kernel_name, im2col_config.build_options)); - _num_elems_processed_per_iteration = 1; + _input = input; + _output = output; + _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; + _kernel_dims = kernel_dims; // Only needed by the Tuner + _conv_info = conv_info; // Only needed by the Tuner - const std::string kernel_name = configure_window(input, output, kernel_dims, dilation, conv_info, build_opts); - // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + im2col_config.is_padding_required_nchw); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); // Set config_id for enabling LWS tuning - _config_id = kernel_name; + _config_id = im2col_config.kernel_name; _config_id += "_"; _config_id += lower_string(string_from_data_type(input->info()->data_type())); _config_id += "_"; @@ -323,31 +324,24 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation) { - ARM_COMPUTE_UNUSED(kernel_dims); - ARM_COMPUTE_UNUSED(conv_info); - ARM_COMPUTE_UNUSED(has_bias); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias, dilation)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation)); + Im2ColConfiguration im2col_config = configure_opencl_kernel(input, kernel_dims, conv_info, has_bias, dilation); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + im2col_config.is_padding_required_nchw) + .first); return Status{}; } void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON(_run_func == nullptr); - (this->*_run_func)(window, queue); -} - -void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const DataLayout data_layout = _input->info()->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const DataLayout data_layout = 
_input->info()->data_layout(); // Get initial windows + // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - // Change the Z dimension's step back to 1 window_collapsed.set_dimension_step(Window::DimZ, 1); Window window_output; @@ -359,36 +353,32 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue) Window slice_in = first_slice_3d; Window slice_out = window_output.first_slice_window_2D(); - const bool out_dim_not_same_input_dim = _convolved_dims.first != _input->info()->dimension(width_idx) || _convolved_dims.second != _input->info()->dimension(height_idx); + if(data_layout == DataLayout::NHWC) + { + const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); + const int num_batches = tmp_win[3].end(); - // Setup slice if convolved dims are not the same as input dims - if(out_dim_not_same_input_dim) + slice.set(1, Window::Dimension(0, static_cast(_output->info()->tensor_shape()[1]), 1)); + slice.set(2, Window::Dimension(0, static_cast(num_batches), 1)); + } + else { - // If the stride_x or stride_y are not 1, the output tensor of matrix multiply (Convolved tensor) will not - // have the same shape of the im2col input tensor - // In this case we need to re-compute the window using the shape of the tensor after matrix multiply (convolved_dims) - slice.set(width_idx, Window::Dimension(0, static_cast(_convolved_dims.first), 1)); - if(data_layout == DataLayout::NHWC) - { - // if layout is NHWC, we need to multiply convolved_dims.height by the number of batches as for this - // format we collapsed HEIGHT and all subsequent dimensions (batches) together. This is necessary to ensure - // global_id(2) values are in the correct range. 
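For NHWC the slice set up here dispatches over (ceil(channels / VECTOR_SIZE), convolved_width * convolved_height, batches). A sketch of that global size computation, under those assumptions (names illustrative, not library API):

    #include <cstddef>

    struct GlobalSize
    {
        std::size_t x, y, z;
    };

    static GlobalSize im2col_nhwc_global_size(std::size_t channels, std::size_t vec,
                                              std::size_t conv_w, std::size_t conv_h,
                                              std::size_t batches)
    {
        return { (channels + vec - 1) / vec, conv_w * conv_h, batches };
    }

    // Example: 64 channels with VECTOR_SIZE=2 and a 34x34 convolved output on
    // batch 4 -> { 32, 1156, 4 } work items.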
- const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); - const int num_batches = tmp_win[3].end(); - slice.set(height_idx, Window::Dimension(0, static_cast(_convolved_dims.second) * num_batches, 1)); - } - else - { - slice.set(height_idx, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); - } + slice.set(0, Window::Dimension(0, static_cast(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); + slice.set(1, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); + // Note: In case of NCHW the 3rd dimension is already set collapsing the input window } // Setup input slice - // The first three dimensions of the input are increased by the inner loops + // The dimensions of the input are increased within the OpenCL kernel slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + // Setup output slice + // The dimensions of the output are increased within the OpenCL kernel + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + do { unsigned int idx = 0; @@ -399,30 +389,4 @@ void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue) enqueue(queue, *this, slice, _lws_hint); } while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); -} - -void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - - Window out_window; - out_window.use_tensor_dimensions(_output->info()->tensor_shape()); - - Window out_slice = out_window.first_slice_window_1D(); - Window in_slice = window.first_slice_window_3D(); - - // Run kernel - do - { - // Set arguments - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, in_slice); - add_1D_tensor_argument(idx, _output, out_slice); - - _kernel.setArg(idx++, _input->info()->dimension(0)); - _kernel.setArg(idx++, _input->info()->dimension(1)); - enqueue(queue, *this, in_slice, _lws_hint); - } - while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); -} +} \ No newline at end of file diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp index 9df91fccde..58ecd9ccb3 100644 --- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp +++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp @@ -87,8 +87,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor * (biases != nullptr) ? 
biases->info() : nullptr, output->info(), num_groups)); - const DataType data_type = input->info()->data_type(); - const DataLayout data_layout = input->info()->data_layout(); + const DataType data_type = input->info()->data_type(); _biases = biases; _output = output; @@ -101,8 +100,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor * build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS"); // Create kernel - std::string kernel_name = std::string("reshape_to_columns_") + lower_string(string_from_data_layout(data_layout)); - _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + _kernel = static_cast(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts.options())); // Set static arguments unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor(); diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 61010711a6..8cb4f4b889 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -49,29 +49,26 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); - TensorShape expected_output_shape; - if(is_flatten) /* Called by FlattenLayer */ + if(output->total_size() > 0) { - expected_output_shape = misc::shape_calculator::compute_im2col_flatten_shape(input); - } - else if(!is_fully_connected) /* Called by ConvolutionLayer */ - { - expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false); - } - else /* Called by FullyConnectedLayer */ - { - const int num_batch_dimensions = std::max(0, static_cast(output->tensor_shape().num_dimensions()) - 1); - const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions; + TensorShape expected_output_shape; - expected_output_shape = misc::shape_calculator::compute_im2col_fc_shape(input, num_input_dimensions); - } + if(is_flatten || is_fully_connected) + { + expected_output_shape = misc::shape_calculator::compute_flatten_shape(input); + } + else + { + expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false); + } - TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); + TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } return Status{}; } diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp index f5809a218a..b372c35dd9 100644 --- a/src/runtime/CL/functions/CLFlattenLayer.cpp +++ b/src/runtime/CL/functions/CLFlattenLayer.cpp @@ -23,8 +23,7 @@ */ #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" -#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" -#include "arm_compute/core/Size2D.h" +#include 
"arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "support/ToolchainSupport.h" @@ -32,8 +31,13 @@ using namespace arm_compute; void CLFlattenLayer::configure(const ICLTensor *input, ICLTensor *output) { - auto k = arm_compute::support::cpp14::make_unique(); - k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output); _kernel = std::move(k); CLScheduler::get().tune_kernel_static(*_kernel); } + +Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + return CLFlattenLayerKernel::validate(input, output); +} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 6fd78a3fc9..60c28a0874 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -73,12 +73,11 @@ Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, c } CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _im2col_kernel(), _convert_weights(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), - _accumulate_biases_kernel(), _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), + : _memory_group(memory_manager), _convert_weights(), _flatten_layer(), _reshape_weights_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), + _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false), _is_prepared(false), _original_weights(nullptr) { } - void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) { if(_is_quantized) @@ -111,20 +110,19 @@ void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLT // If the fully connected layer is called after a convolution layer, the input tensor must be linearized - // Initialize output tensor for im2col - TensorShape shape_im2col = compute_im2col_fc_shape(input->info()); - _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW)); + // Initialize output tensor for flatten + TensorShape shape_flatten = compute_flatten_shape(input->info()); + _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten).set_data_layout(DataLayout::NCHW)); - // Configure im2col kernel - _memory_group.manage(&_im2col_output); - _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false); - CLScheduler::get().tune_kernel_static(_im2col_kernel); + // Configure flatten kernel + _memory_group.manage(&_flatten_output); + _flatten_layer.configure(input, &_flatten_output); // Configure matrix multiply kernel - configure_mm(&_im2col_output, weights, output); + configure_mm(&_flatten_output, weights, output); - // Allocate the output tensor for im2col once all the configure methods have been called - 
_im2col_output.allocator()->allocate(); + // Allocate the output tensor for flatten once all the configure methods have been called + _flatten_output.allocator()->allocate(); } void CLFullyConnectedLayer::configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output) @@ -254,7 +252,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const GPUTarget gpu_target = CLScheduler::get().target(); - const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)).set_data_layout(DataLayout::NCHW)); + const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)).set_data_layout(DataLayout::NCHW)); const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); @@ -311,9 +309,9 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn // Fully Connected layer after a Convolution Layer without batches ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2)))); - // Validate im2col kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false)); - input_to_use = &im2col_input; + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &flatten_input)); + input_to_use = &flatten_input; } else { @@ -341,7 +339,7 @@ void CLFullyConnectedLayer::run() // Linearize input if it comes from a convolutional layer if(_is_fc_after_conv) { - CLScheduler::get().enqueue(_im2col_kernel, false); + _flatten_layer.run(); } // Run matrix multiply diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 1d1b17bbf1..a8d7058f2a 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -171,6 +171,7 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); // Check if we need to reshape the matrix B only on the first run const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); @@ -180,7 +181,7 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; - TensorInfo tmp_output_info = *output->clone(); + TensorInfo tmp_output_info{}; // Get the GPU target const GPUTarget gpu_target = CLScheduler::get().target(); diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index fb90415e31..49549a0ad0 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -171,7 +171,6 @@ void 
CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * const DataLayout data_layout = input->info()->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); const unsigned int kernel_width = weights->info()->dimension(idx_width); @@ -193,7 +192,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * ICLTensor *gemm_output_to_use = output; ICLTensor *gemm_output_staged_to_use = output; - const unsigned bias_element = (_append_bias && !_skip_im2col) ? 1 : 0; const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr; // Get parameters from conv_info @@ -212,7 +210,6 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * dilation); unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->info()->dimension(idx_width) * weights->info()->dimension(idx_height) * weights->info()->dimension(idx_channel) + bias_element; // _weights_reshaped will be auto configured in the kernel. // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM @@ -223,25 +220,13 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor * // Create tensor to store im2col reshaped inputs if(!_skip_im2col) { - // Calculate im2col shape - // For OpenCL the batch size is on the third dimension - // TODO (giaiod01): Use auto-init COMPMID-1277 - TensorShape shape_im2col = input->info()->tensor_shape(); - if(shape_im2col.num_dimensions() >= 3) - { - shape_im2col.remove_dimension(2); - } - shape_im2col.set(0, mat_weights_rows); - shape_im2col.set(1, conv_w * conv_h); - - // FIXME: input->clone() doesn't work with subtensors for grouped convolutions. - TensorInfo im2col_reshaped_info(shape_im2col, 1, data_type); - im2col_reshaped_info.set_quantization_info(input->info()->quantization_info()); - _im2col_output.allocator()->init(im2col_reshaped_info); _memory_group.manage(&_im2col_output); - // Configure and tune im2col + // Configure and tune im2col. im2col output shape is auto-initialized _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, dilation); + + // Set quantization info + _im2col_output.info()->set_quantization_info(input->info()->quantization_info()); CLScheduler::get().tune_kernel_static(_im2col_kernel); // Update GEMM input @@ -350,11 +335,10 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI const ITensorInfo *gemm_output_staged_to_use = output; const ITensorInfo *weights_to_use = weights; - const bool is_nhwc = data_layout == DataLayout::NHWC; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized; - const bool append_bias = (biases != nullptr) && (!is_quantized); - const unsigned bias_element = (append_bias && !skip_im2col) ? 
1 : 0; + const bool is_nhwc = data_layout == DataLayout::NHWC; + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized; + const bool append_bias = (biases != nullptr) && (!is_quantized); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); @@ -391,7 +375,6 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI dilation); unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel) + bias_element; // Output tensor auto initialization if not yet initialized ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayerReshapeWeights::validate(weights, is_quantized ? nullptr : biases, nullptr)); @@ -400,19 +383,14 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI if(!skip_im2col) { - // Create tensor info for im2col reshaped inputs - // For OpenCL the batch size is on the third dimension - // TODO (giaiod01): Use auto-init COMPMID-1277 - TensorShape shape_im2col = input->tensor_shape(); - if(input->tensor_shape().num_dimensions() >= 3) - { - shape_im2col.remove_dimension(2); - } - shape_im2col.set(0, mat_weights_rows); - shape_im2col.set(1, conv_w * conv_h); - im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); - im2col_reshaped_info.set_quantization_info(input->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation)); + const Size2D kernel_dims(kernel_width, kernel_height); + + // Output tensor auto initialization if not yet initialized + TensorShape expected_output_shape = compute_im2col_conv_shape(input, kernel_dims, conv_info, append_bias, dilation, true); + + auto_init_if_empty(im2col_reshaped_info, input->clone()->set_tensor_shape(expected_output_shape)); + + ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_reshaped_info, kernel_dims, conv_info, append_bias, dilation)); gemm_input_to_use = &im2col_reshaped_info; } else if(append_bias) diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 25b8adc431..c2f0283d4e 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -113,7 +113,7 @@ void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITenso // If the fully connected layer is called after a convolution layer, the input tensor must be linearized // Initialize output tensor for im2col - TensorShape shape_im2col = compute_im2col_fc_shape(input->info()); + TensorShape shape_im2col = compute_flatten_shape(input->info()); _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col)); // Configure im2col kernel @@ -249,7 +249,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn bool is_fc_after_conv = true; bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); - const ITensorInfo &im2col_input =
TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input))); + const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input))); const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); @@ -420,4 +420,4 @@ void NEFullyConnectedLayer::prepare() _is_prepared = true; } -} +} \ No newline at end of file diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index c0a5d0a436..df4a040bad 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -223,7 +223,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig { // Calculate im2col shape // For NEON the batch size is on the fourth dimension - // TODO (giaiod01): Use auto-init COMPMID-1277 + // TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482 TensorShape shape_im2col = input->info()->tensor_shape(); shape_im2col.set(0, mat_weights_rows); shape_im2col.set(1, conv_w * conv_h); @@ -232,7 +232,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col)); _memory_group.manage(&_im2col_output); - // Configure and tune im2col + // Configure _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, _append_bias, false, false, dilation); // Update GEMM input diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index 937bd08c6f..4a6d778f9f 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -160,14 +160,14 @@ public: : ShapeDataset("Shape", { // Batch size 1 - TensorShape{ 9U, 9U }, + TensorShape{ 11U, 11U }, TensorShape{ 27U, 13U, 2U }, TensorShape{ 128U, 64U, 1U, 3U }, // Batch size 4 - TensorShape{ 9U, 9U, 3U, 4U }, + TensorShape{ 11U, 11U, 3U, 4U }, TensorShape{ 27U, 13U, 2U, 4U }, // Arbitrary batch size - TensorShape{ 9U, 9U, 3U, 5U } + TensorShape{ 11U, 11U, 3U, 5U } }) { } diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp index 9422fcc49b..291befa6cb 100644 --- a/tests/validation/CL/Im2Col.cpp +++ b/tests/validation/CL/Im2Col.cpp @@ -41,8 +41,18 @@ namespace validation { namespace { -const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(3U, 1U), Size2D(1U, 5U), Size2D(5U, 5U), Size2D(7U, 7U) }); -const auto padstrides = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) }); +// *INDENT-OFF* +// clang-format off +const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), + Size2D(5U, 5U), + Size2D(3U, 1U), + Size2D(1U, 3U), + Size2D(5U, 3U), + Size2D(1U, 1U), + Size2D(11U, 11U)} ); +const auto padstrides = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), + PadStrideInfo(1U, 1U, 1U, 
1U), + PadStrideInfo(2U, 2U, 0U, 2U) }); const auto conv_args = combine(combine(combine(conv_filter_sizes, padstrides), framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })); @@ -53,23 +63,19 @@ TEST_SUITE(Im2Col) using CLIm2Col = CLSynthetizeFunction<CLIm2ColKernel>; -// *INDENT-OFF* -// clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::U8), // Unsupported data type TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::F32), // Mismatching data type TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QASYMM8), // Bias not supported with QASYMM8 - TensorInfo(TensorShape(10U, 12U, 2U), 1, DataType::QASYMM8), // Mismatching shapes TensorInfo(TensorShape(10U, 12U, 2U, 2U), 1, DataType::QASYMM8), }), framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::F16), TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::F16), TensorInfo(TensorShape(3U, 3U, 10U, 2U), 1, DataType::QASYMM8), - TensorInfo(TensorShape(3U, 4U, 10U, 2U), 1, DataType::QASYMM8), - TensorInfo(TensorShape(18U, 80U, 1U, 2U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(18U, 80U, 2U, 1U), 1, DataType::QASYMM8), })), - framework::dataset::make("HasBias", { true, true, true, false, false })), - framework::dataset::make("Expected", { false, false, false, true, true })), + framework::dataset::make("HasBias", { true, true, true, false })), + framework::dataset::make("Expected", { false, false, false, true })), input_info, output_info, has_bias, expected) { @@ -83,16 +89,18 @@ template <typename T> using CLIm2ColFixture = Im2ColValidationFixture<CLTensor, CLAccessor, CLIm2Col, T>; TEST_SUITE(Float) TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", true))) { // Validate output validate(CLAccessor(_target), _reference); } TEST_SUITE_END() -FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", true))) { // Validate output validate(CLAccessor(_target), _reference); @@ -101,14 +109,16 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode: #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", true))) { // Validate output validate(CLAccessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge,
CLIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", true))) { // Validate output validate(CLAccessor(_target), _reference); @@ -120,14 +130,16 @@ TEST_SUITE_END() TEST_SUITE_END() TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", true))) { // Validate output validate(CLAccessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", false))) { // Validate output validate(CLAccessor(_target), _reference); diff --git a/tests/validation/CL/LocallyConnected.cpp b/tests/validation/CL/LocallyConnected.cpp index 5381072131..dbfe4e269f 100644 --- a/tests/validation/CL/LocallyConnected.cpp +++ b/tests/validation/CL/LocallyConnected.cpp @@ -59,6 +59,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/bias TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Mismatching shape input/output TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Asymmetric padding + TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32), // Padding required TensorInfo(TensorShape(23U, 27U, 5U), 1, DataType::F32) }), framework::dataset::make("WeightsInfo",{ TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F16), @@ -68,7 +69,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32) + TensorInfo(TensorShape(3U, 3U, 5U, 21U, 275U), 1, DataType::F32), + TensorInfo(TensorShape(1U, 3U, 5U, 21U, 575U), 1, DataType::F32) })), framework::dataset::make("BiasInfo", { TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), TensorInfo(TensorShape(21U, 275U), 1, DataType::F16), @@ -77,7 +79,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(21U, 274U), 1, DataType::F32), TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), - TensorInfo(TensorShape(21U, 275U), 1, DataType::F32) + TensorInfo(TensorShape(21U, 275U), 1, DataType::F32), + TensorInfo(TensorShape(21U, 575U), 1, DataType::F32) })), framework::dataset::make("OutputInfo", {
TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), @@ -86,7 +89,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), TensorInfo(TensorShape(11U, 25U, 22U), 1, DataType::F32), TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), - TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32) + TensorInfo(TensorShape(11U, 25U, 21U), 1, DataType::F32), + TensorInfo(TensorShape(23U, 25U, 21U), 1, DataType::F32) })), framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), @@ -94,10 +98,11 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( PadStrideInfo(2, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), - PadStrideInfo(2, 1, 1, 0, 0, 0, DimensionRoundingType::FLOOR), - PadStrideInfo(2, 1, 0, 0) + PadStrideInfo(2, 1, 1, 0), + PadStrideInfo(2, 1, 0, 0), + PadStrideInfo(1, 1, 0, 0) })), - framework::dataset::make("Expected", { false, false, false, false, false, false, false, true })), + framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true })), input_info, weights_info, bias_info, output_info, conv_info, expected) { bool is_valid = bool(CLLocallyConnectedLayer::validate(&input_info.clone()->set_is_resizable(false), diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp index bff8634f8f..f011ebe935 100644 --- a/tests/validation/NEON/Im2Col.cpp +++ b/tests/validation/NEON/Im2Col.cpp @@ -77,14 +77,16 @@ using NEIm2ColFixture = Im2ColValidationFixture<Tensor, Accessor, NEIm2Col, T> -FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", false))) { // Validate output validate(Accessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", false))) { // Validate output validate(Accessor(_target), _reference); @@ -94,14 +96,16 @@ TEST_SUITE_END() #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", false))) { // Validate output validate(Accessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<half>, framework::DatasetMode::NIGHTLY,
combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", false))) { // Validate output validate(Accessor(_target), _reference); @@ -113,14 +117,16 @@ TEST_SUITE_END() TEST_SUITE_END() TEST_SUITE(QASYMM8) -FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", false))) { // Validate output validate(Accessor(_target), _reference); } -FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), - conv_args)) +FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), + conv_args), + framework::dataset::make("ChannelsFirstOutputNHWC", false))) { // Validate output validate(Accessor(_target), _reference); diff --git a/tests/validation/fixtures/FlattenLayerFixture.h b/tests/validation/fixtures/FlattenLayerFixture.h index f273e9315c..d17080695b 100644 --- a/tests/validation/fixtures/FlattenLayerFixture.h +++ b/tests/validation/fixtures/FlattenLayerFixture.h @@ -55,7 +55,7 @@ public: { TensorShape shape_flatten; TensorInfo input_info(shape, 1, data_type); - shape_flatten = compute_im2col_flatten_shape(&input_info); + shape_flatten = compute_flatten_shape(&input_info); _target = compute_target(shape, shape_flatten, data_type); _reference = compute_reference(shape, shape_flatten, data_type); diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h index f72e38fefc..da2576b37c 100644 --- a/tests/validation/fixtures/Im2ColFixture.h +++ b/tests/validation/fixtures/Im2ColFixture.h @@ -49,7 +49,8 @@ class Im2ColValidationFixture : public framework::Fixture { public: template <typename...> - void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout) + void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout, + bool channels_first_output_nhwc) { _kernel_dims = kernel_dims; _conv_info = conv_info; @@ -68,7 +69,7 @@ public: const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z); _target = compute_target(input_shape, output_shape, data_type); - compute_reference(input_shape, output_shape, data_type); + compute_reference(input_shape, output_shape, data_type, channels_first_output_nhwc); } protected: @@ -107,14 +108,16 @@ protected: return dst; } - void compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type) + void compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type, bool channels_first_output_nhwc) { // Create reference SimpleTensor<T> src{ input_shape, data_type, 1, _quant_info, _data_layout }; _reference =
SimpleTensor<T>(output_shape, data_type, 1, _quant_info, DataLayout::NCHW); + // Fill reference fill(src); - reference::im2col(src, _reference, _kernel_dims, _conv_info, _has_bias); + + reference::im2col(src, _reference, _kernel_dims, _conv_info, _has_bias, channels_first_output_nhwc); } TensorType _target{}; SimpleTensor<T> _reference{}; diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp index 83ef8b40a5..2459499474 100644 --- a/tests/validation/reference/Im2Col.cpp +++ b/tests/validation/reference/Im2Col.cpp @@ -23,8 +23,6 @@ */ #include "Im2Col.h" -#include "Permute.h" - #include "arm_compute/core/Types.h" #include "tests/validation/Helpers.h" #include "tests/validation/reference/Utils.h" @@ -41,46 +39,45 @@ template <typename T> void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) { ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW); - // Create reference - const int pad_x = conv_info.pad().first; - const int pad_y = conv_info.pad().second; const int stride_x = conv_info.stride().first; const int stride_y = conv_info.stride().second; const int kernel_width = kernel_dims.width; const int kernel_height = kernel_dims.height; + const int pad_x = conv_info.pad().first; + const int pad_y = conv_info.pad().second; const int src_width = src.shape().x(); const int src_height = src.shape().y(); - const int src_depth = src.shape().z(); + const int src_channels = src.shape().z(); const int batches = src.shape().total_size_upper(3); + const int dst_height = dst.shape().y(); const int pad_val = is_data_type_quantized_asymmetric(src.data_type()) ? src.quantization_info().offset : 0; + int dst_idx = 0; - int dst_idx = 0; - // dst[dst_idx++] will write out of bounds if kernel_height == kernel_width == 1 because lasty will be the bottom padding row - // and this is not present in the dst buffer - const int lasty = src_height + (kernel_height > 1 ? pad_y : 0) - kernel_height; - const int lastx = src_width + (kernel_width > 1 ?
pad_x : 0) - kernel_width; + // Compute width and height of the convolved tensors + std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src_width, src_height, kernel_dims.width, kernel_dims.height, conv_info); for(int b = 0; b < batches; ++b) { - for(int y = -pad_y; y <= lasty; y += stride_y) + for(int yo = 0; yo < dst_height; ++yo) { - for(int x = -pad_x; x <= lastx; x += stride_x) + // Compute input spatial coordinates + const int xi = (yo % convolved_dims.first) * stride_x; + const int yi = (yo / convolved_dims.first) * stride_y; + + for(int ci = 0; ci < src_channels; ++ci) { - for(int z = 0; z < src_depth; ++z) + for(int yk = 0; yk < kernel_height; ++yk) { - for(int patch_y = y; patch_y < (y + kernel_height); ++patch_y) + for(int xk = 0; xk < kernel_width; ++xk) { - for(int patch_x = x; patch_x < (x + kernel_width); ++patch_x) - { - dst[dst_idx++] = tensor_elem_at(src, Coordinates(patch_x, patch_y, z, b), BorderMode::CONSTANT, static_cast<T>(pad_val)); - } + dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val)); } } + } - if(has_bias) - { - dst[dst_idx++] = static_cast<T>(1); - } + if(has_bias) + { + dst[dst_idx++] = static_cast<T>(1); } } } @@ -133,7 +130,56 @@ void im2col_nhwc(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D } template <typename T> -void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) +void im2col_nhwc_channel_first(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NHWC); + const int stride_x = conv_info.stride().first; + const int stride_y = conv_info.stride().second; + const int kernel_width = kernel_dims.width; + const int kernel_height = kernel_dims.height; + const int pad_x = conv_info.pad().first; + const int pad_y = conv_info.pad().second; + const int src_width = src.shape().y(); + const int src_height = src.shape().z(); + const int src_channels = src.shape().x(); + const int batches = src.shape().total_size_upper(3); + const int dst_width = has_bias ? dst.shape().x() - 1 : dst.shape().x(); + const int dst_height = dst.shape().y(); + const int pad_val = is_data_type_quantized_asymmetric(src.data_type()) ?
src.quantization_info().offset : 0; + + // Compute width and height of the convolved tensors + std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src_width, src_height, kernel_dims.width, kernel_dims.height, conv_info); + + for(int b = 0; b < batches; ++b) + { + for(int yo = 0; yo < dst_height; ++yo) + { + // Compute input spatial coordinates + const int xi = (yo % convolved_dims.first) * stride_x; + const int yi = (yo / convolved_dims.first) * stride_y; + + for(int ci = 0; ci < src_channels; ++ci) + { + for(int yk = 0; yk < kernel_height; ++yk) + { + for(int xk = 0; xk < kernel_width; ++xk) + { + dst[ci + (xk + yk * kernel_width) * src_channels + yo * dst.shape().x() + b * dst.shape().x() * dst.shape().y()] = tensor_elem_at(src, Coordinates(ci, xi + xk - pad_x, yi + yk - pad_y, b), + BorderMode::CONSTANT, static_cast<T>(pad_val)); + } + } + } + + if(has_bias) + { + dst[dst_width + yo * dst.shape().x() + b * dst.shape().x() * dst.shape().y()] = static_cast<T>(1); + } + } + } +} + +template <typename T> +void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc) { switch(src.data_layout()) { @@ -144,7 +190,14 @@ void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kern } case DataLayout::NHWC: { - im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias); + if(channels_first_output_nhwc) + { + im2col_nhwc_channel_first(src, dst, kernel_dims, conv_info, has_bias); + } + else + { + im2col_nhwc(src, dst, kernel_dims, conv_info, has_bias); + } break; } default: @@ -155,9 +208,9 @@ void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kern } } -template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); -template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); -template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); +template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc); +template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc); +template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h index 5277171a2f..b1ebaf25da 100644 --- a/tests/validation/reference/Im2Col.h +++ b/tests/validation/reference/Im2Col.h @@ -35,7 +35,7 @@ namespace validation namespace reference { template <typename T> -void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); +void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc = false); } // namespace reference } // namespace validation } // namespace test -- cgit v1.2.1
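
[Editor's note] The fully connected changes above replace the old im2col-based linearization with the flatten path (CLFlattenLayer / compute_flatten_shape), which collapses the first three input dimensions into one and keeps the batch count as the second dimension. A minimal standalone sketch of that shape computation follows, assuming an ACL-style [w, h, d, batches] ordering; Shape4D and flatten_shape are hypothetical illustration helpers, not part of the arm_compute API.

#include <array>
#include <cstddef>
#include <cstdio>

// Hypothetical 4D shape: dims[0] = w, dims[1] = h, dims[2] = d, dims[3] = batches.
using Shape4D = std::array<std::size_t, 4>;

// Collapse [w, h, d, batches] into [w*h*d, batches], mirroring the tensor shape
// that the flatten output is given in the patch above (e.g. when sizing the
// fully connected layer's linearized input).
Shape4D flatten_shape(const Shape4D &in)
{
    return Shape4D{ in[0] * in[1] * in[2], in[3], 1, 1 };
}

int main()
{
    const Shape4D in{ 27, 13, 2, 4 }; // one of the test shapes used above
    const Shape4D out = flatten_shape(in);
    std::printf("[%zu, %zu]\n", out[0], out[1]); // prints [702, 4]
    return 0;
}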
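
[Editor's note] The new im2col_nhwc_channel_first reference above emits one destination row per convolution window position (yo), laying each row out kernel-tap by kernel-tap with the channels of every tap stored contiguously: dst[ci + (xk + yk * kernel_width) * src_channels + ...]. The self-contained sketch below reproduces that index arithmetic on a toy single-batch NHWC tensor (no padding, no bias) so the layout can be inspected; every size and name in it is an illustrative assumption, not a value taken from the patch.

#include <cstdio>
#include <vector>

int main()
{
    // Toy NHWC input, batch = 1: C channels, W x H spatial extent (assumed sizes).
    const int C = 2, W = 3, H = 3;
    const int kw = 2, kh = 2; // kernel dims
    const int stride = 1;     // stride; padding omitted for brevity
    const int out_w = (W - kw) / stride + 1;
    const int out_h = (H - kh) / stride + 1;

    // NHWC indexing: element (c, x, y) lives at c + x * C + y * C * W.
    std::vector<float> src(C * W * H);
    for(int i = 0; i < static_cast<int>(src.size()); ++i)
    {
        src[i] = static_cast<float>(i);
    }

    // One dst row per window; row length kw * kh * C (no bias element here).
    const int dst_w = kw * kh * C;
    std::vector<float> dst(dst_w * out_w * out_h);

    for(int yo = 0; yo < out_w * out_h; ++yo)
    {
        const int xi = (yo % out_w) * stride; // top-left corner of the window
        const int yi = (yo / out_w) * stride;
        for(int ci = 0; ci < C; ++ci)
        {
            for(int yk = 0; yk < kh; ++yk)
            {
                for(int xk = 0; xk < kw; ++xk)
                {
                    // Same arithmetic as the reference: tap-major row layout,
                    // channels contiguous inside each kernel tap.
                    dst[ci + (xk + yk * kw) * C + yo * dst_w] =
                        src[ci + (xi + xk) * C + (yi + yk) * C * W];
                }
            }
        }
    }

    // First row = the 2x2 window at (0, 0) across both channels:
    // prints "0 1 2 3 6 7 8 9"
    for(int i = 0; i < dst_w; ++i)
    {
        std::printf("%g ", dst[i]);
    }
    std::printf("\n");
    return 0;
}

In the validation suites above, this layout is exercised through the ChannelsFirstOutputNHWC dataset flag, which routes the reference through im2col_nhwc_channel_first instead of im2col_nhwc.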