From d2fab7315bac3a586f2f1b1c8d64f2441f89ca64 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 2 Mar 2018 11:18:12 +0000 Subject: COMPMID-935 - Implementing Convolution with Winograd on OpenCL (part 4) Implemented Winograd Output Transform (2x2,3x3) on OpenCL Implemented CLWinogradConvolutionLayer on OpenCL Change-Id: I6a113fc5f052ca07f878d2b800d2ab003f84af65 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125148 Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- arm_compute/core/CL/CLKernels.h | 1 + .../core/CL/kernels/CLGEMMMatrixMultiplyKernel.h | 1 + .../CL/kernels/CLWinogradOutputTransformKernel.h | 81 +++++++ arm_compute/core/utils/misc/ShapeCalculator.h | 40 +++- arm_compute/runtime/CL/CLFunctions.h | 1 + .../CL/functions/CLWinogradConvolutionLayer.h | 97 ++++++++ src/core/CL/CLKernelLibrary.cpp | 1 + src/core/CL/cl_kernels/gemm.cl | 127 +++++++++-- src/core/CL/cl_kernels/winograd.cl | 247 +++++++++++++++------ src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 21 +- src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp | 7 +- .../CL/kernels/CLWinogradFilterTransformKernel.cpp | 2 +- .../CL/kernels/CLWinogradInputTransformKernel.cpp | 7 +- .../CL/kernels/CLWinogradOutputTransformKernel.cpp | 188 ++++++++++++++++ src/runtime/CL/functions/CLGEMM.cpp | 14 +- .../CL/functions/CLWinogradConvolutionLayer.cpp | 146 ++++++++++++ .../CL/functions/CLWinogradInputTransform.cpp | 2 +- tests/datasets/LargeConvolutionLayerDataset.h | 24 +- tests/datasets/SmallConvolutionLayerDataset.h | 12 +- tests/datasets/WinogradOutputTransformDataset.h | 153 +++++++++++++ tests/validation/CL/Winograd.cpp | 179 ++++++++++++++- tests/validation/NEON/ConvolutionLayer.cpp | 6 +- tests/validation/fixtures/WinogradLayerFixture.h | 120 ++++++++-- tests/validation/reference/ConvolutionLayer.cpp | 2 +- tests/validation/reference/Winograd.cpp | 218 ++++++++++++++---- tests/validation/reference/Winograd.h | 3 + 26 files changed, 1506 insertions(+), 194 deletions(-) create mode 100644 arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h create mode 100644 arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h create mode 100644 src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp create mode 100644 src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp create mode 100644 tests/datasets/WinogradOutputTransformDataset.h diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index ef629c2e81..6f5c61523f 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -111,5 +111,6 @@ #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h" #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" #include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h" +#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" #endif /* __ARM_COMPUTE_CLKERNELS_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h index 7260c4a4f6..ee7e7c0e97 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h @@ -84,6 +84,7 @@ private: const ICLTensor *_input0; const ICLTensor *_input1; ICLTensor *_output; + bool _slide_matrix_b; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h new file mode 100644 index 0000000000..35117c65db --- /dev/null +++ b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__ +#define __ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the Winograd output transform kernel. */ +class CLWinogradOutputTransformKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLWinogradOutputTransformKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete; + /** Allow instances of this class to be moved */ + CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default; + /** Allow instances of this class to be moved */ + CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default; + /** Default destructor */ + ~CLWinogradOutputTransformKernel() = default; + /** Set the input and output tensor. + * + * @param[in] input Source tensor with shape [C, N, 16, batches]. Data types supported: F32. + * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input + * @param[out] output Destination tensor with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input + * @param[in] kernel_dims Kernel dimensions (Width and height). Currently only supported 3x3 kernels + * @param[in] output_convolved_dims Output dimensions after the convolution (Width and height) + * @param[in] num_tiles Number of tiles of size 2x2 in the output tensor along the X and Y direction + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles); + /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel + * + * @param[in] input Source tensor with shape [C, N, 16, batches]. Data types supported: F32. + * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input + * @param[out] output Destination tensor with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input + * @param[in] kernel_dims Kernel dimensions (Width and height). Currently only supported 3x3 kernels + * @param[in] output_convolved_dims Output dimensions after the convolution (Width and height) + * @param[in] num_tiles Number of tiles of size 2x2 in the output tensor along the X and Y direction + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H__ */ diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 1e90927a93..5344ce7e74 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -28,6 +28,8 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Utils.h" +#include + namespace arm_compute { namespace misc @@ -233,19 +235,45 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp return output_shape; } + +inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &input, const Size2D &output_convolved_dims, DataLayout data_layout) +{ + TensorShape tensor_shape{ input.tensor_shape() }; + + // Output dimension + const unsigned int out_w = output_convolved_dims.width; + const unsigned int out_h = output_convolved_dims.height; + const unsigned int out_c = input.dimension(0); + + tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH), out_w); + tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT), out_h); + tensor_shape.set(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL), out_c); + + return tensor_shape; +} + inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info) { const TensorShape input_shape{ input.tensor_shape() }; const TensorShape weights_shape{ weights.tensor_shape() }; - unsigned int output_width = 0; - unsigned int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info); + const size_t idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL); + + const unsigned int input_width = input_shape[idx_width]; + const unsigned int input_height = input_shape[idx_height]; + const unsigned int weights_width = weights_shape[idx_width]; + const unsigned int weights_height = weights_shape[idx_height]; + const unsigned int weights_channel = weights_shape[idx_channel]; + unsigned int output_width = 0; + unsigned int output_height = 0; + std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info); TensorShape output_shape{ input_shape }; - output_shape.set(0, output_width); - output_shape.set(1, output_height); - output_shape.set(2, weights_shape[3]); + output_shape.set(idx_width, output_width); + output_shape.set(idx_height, output_height); + output_shape.set(idx_channel, weights_channel); return output_shape; } diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 7c2377aa09..adf240e642 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -107,6 +107,7 @@ #include "arm_compute/runtime/CL/functions/CLTranspose.h" #include "arm_compute/runtime/CL/functions/CLWarpAffine.h" #include "arm_compute/runtime/CL/functions/CLWarpPerspective.h" +#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" #endif /* __ARM_COMPUTE_CLFUNCTIONS_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h new file mode 100644 index 0000000000..14de169236 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__ +#define __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__ + +#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: + * + * -# @ref CLWinogradInputTransform + * -# @ref CLWinogradFilterTransformKernel (only once) + * -# @ref CLGEMM + * -# @ref CLWinogradOutputTransformKernel + * + */ +class CLWinogradConvolutionLayer : public IFunction +{ +public: + /** Default constructor */ + CLWinogradConvolutionLayer(std::shared_ptr memory_manager = nullptr); + /** Set the input and output tensors. + * + * @note: This function only works with 3x3 kernels and unit strides + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer + * + * @note: This function only works with 3x3 kernels and unit strides + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run() override; + +private: + CLMemoryGroup _memory_group; + CLGEMM _batched_mm; + CLWinogradInputTransform _input_transform; + CLWinogradFilterTransformKernel _filter_transform; + CLWinogradOutputTransformKernel _output_transform; + CLTensor _input0; + CLTensor _input1; + CLTensor _batched_mm_output; + bool _is_first_run; +}; +} +#endif /* __ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H__ */ diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 4b7fa8a3b3..9df2dcbacd 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -354,6 +354,7 @@ const std::map CLKernelLibrary::_kernel_program_map = { "winograd_filter_transform_2x2_3x3_nchw", "winograd.cl" }, { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl" }, { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl" }, + { "winograd_output_transform_2x2_3x3_nchw", "winograd.cl" }, { "YUYV422_to_IYUV_bt709", "color_convert.cl" }, { "YUYV422_to_NV12_bt709", "color_convert.cl" }, { "YUYV422_to_RGB888_bt709", "color_convert.cl" }, diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index cba5eea437..a5b0acbe9c 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -162,6 +162,8 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src), * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -199,8 +201,18 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) // src_addr_a = address of matrix A // src_addr_b = address of matrix B - __global float *src_addr_a = (__global float *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes); + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); // Compute end row address for matrix B __global float *src_end_addr_b = src_addr_b + COLS_B; @@ -277,6 +289,9 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -314,8 +329,18 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) // src_addr_a = address of matrix A // src_addr_b = address of matrix B - __global float *src_addr_a = (__global float *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes); + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); // Compute end row address for matrix B __global float *src_end_addr_b = src_addr_b + COLS_B; @@ -510,6 +535,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2) + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -547,8 +574,18 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), // src_addr_a = address of matrix A // src_addr_b = address of matrix B - __global half *src_addr_a = (__global half *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes); - __global half *src_addr_b = (__global half *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes); + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); // Compute end row address for matrix B __global half *src_end_addr_b = src_addr_b + COLS_B; @@ -627,8 +664,9 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * - * @note: ALPHA must be passed in 8 bit fixed point format + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) + * @note:ALPHA must be passed in 8 bit fixed point format * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -666,8 +704,18 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), // src_addr_a = address of matrix A // src_addr_b = address of matrix B - __global char *src_addr_a = src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - __global char *src_addr_b = src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes; + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global char *src_addr_a = (__global char *)(src0_ptr + src0_addr_in_bytes); + __global char *src_addr_b = (__global char *)(src1_ptr + src1_addr_in_bytes); // Compute end row address for matrix B __global char *src_end_addr_b = src_addr_b + COLS_B; @@ -738,8 +786,9 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2) - * - * @note: ALPHA must be passed in 16 bit fixed point format + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) + * @note:ALPHA must be passed in 16 bit fixed point format * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -777,8 +826,18 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), // src_addr_a = address of matrix A // src_addr_b = address of matrix B - __global short *src_addr_a = (__global short *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes); - __global short *src_addr_b = (__global short *)(src1_ptr + z * src1_stride_z + x * src1_stride_y + src1_offset_first_element_in_bytes); + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global short *src_addr_a = (__global short *)(src0_ptr + src0_addr_in_bytes); + __global short *src_addr_b = (__global short *)(src1_ptr + src1_addr_in_bytes); // Compute end row address for matrix B __global short *src_end_addr_b = src_addr_b + COLS_B; @@ -845,6 +904,8 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y * @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -885,7 +946,13 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), // Add offset for batched GEMM src_addr.s0 += get_global_id(2) * src0_stride_z; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE)); @@ -1013,6 +1080,8 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4. * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -1054,8 +1123,12 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), // Add offset for batched GEMM src_addr.s0 += get_global_id(2) * src0_stride_z; - // For convolution layer we do not want to slide the matrix B along Z +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) // Address boundary for matrix A int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float)); @@ -1251,6 +1324,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2. * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f. + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -1293,8 +1368,12 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), // Add offset for batched GEMM src_addr.s0 += get_global_id(2) * src0_stride_z; - // For convolution layer we do not want to slide the matrix B along Z +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) // Address boundary for the matrix A int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float)); @@ -1460,6 +1539,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), * @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -1500,7 +1581,13 @@ __kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0), // Add offset for batched GEMM src_addr.s0 += get_global_id(2) * src0_stride_z; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char)); @@ -1636,6 +1723,8 @@ __kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0), * @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION * @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16]) * * @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16 * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) @@ -1676,7 +1765,13 @@ __kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0), // Add offset for batched GEMM src_addr.s0 += get_global_id(2) * src0_stride_z; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short)); diff --git a/src/core/CL/cl_kernels/winograd.cl b/src/core/CL/cl_kernels/winograd.cl index 238e21a18a..25c129d0aa 100644 --- a/src/core/CL/cl_kernels/winograd.cl +++ b/src/core/CL/cl_kernels/winograd.cl @@ -23,8 +23,102 @@ */ #include "helpers.h" -#if defined(NUM_TILES_X) +#if defined(NUM_CHANNELS) + +/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2 + * + * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x2_3x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS); + + const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); + + // Load the values from the input tensor + float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y)); + float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y)); + float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y)); + + // Transform the 3x3 tile in a 4x4 tile + float4 out0 = 0.0f; + float4 out1 = 0.0f; + float4 out2 = 0.0f; + float4 out3 = 0.0f; + + // Row 0 + out0.s0 = (w0.s0); + out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f; + out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f; + out0.s3 = (w0.s2); + + // Row 1 + out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f; + out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f; + out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f; + out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f; + + // Row 2 + out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f; + out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f; + out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f; + out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f; + + // Row 3 + out3.s0 = (w2.s0); + out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f; + out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f; + out3.s3 = (w2.s2); + int z = get_global_id(2); + int x0 = z / NUM_CHANNELS; // idx filter + int y0 = z % NUM_CHANNELS; // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; + + // Store the 16 values across the 16 channels + *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global float *)(dst_addr + 4 * dst_stride_z) = out1.s0; + *(__global float *)(dst_addr + 5 * dst_stride_z) = out1.s1; + *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s2; + *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s3; + *(__global float *)(dst_addr + 8 * dst_stride_z) = out2.s0; + *(__global float *)(dst_addr + 9 * dst_stride_z) = out2.s1; + *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2; + *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3; + *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0; + *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1; + *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2; + *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3; +} +#endif // defined(NUM_CHANNELS) + +#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) /** This OpenCL kernel computes the input transform when the kernel size is 3x3 and the output tile is 2x2 * * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). @@ -205,13 +299,12 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z)); vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z)); } -#endif //defined(NUM_TILES_X) +#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) -#if defined(NUM_CHANNELS) - -/** This OpenCL kernel performs Winograd filter transform 3x3 when the data format is NCHW and the output tile is 2x2 +#if defined(NUM_TILES_X) +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2, the filter size 3x3 and the data format is NCHW * - * @note The number of channels must be passed at compile time using -DNUM_CHANNELS: e.g. -DNUM_CHANNELS=64 + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) @@ -220,8 +313,6 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) @@ -232,72 +323,84 @@ __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void winograd_filter_transform_2x2_3x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) +__kernel void winograd_output_transform_2x2_3x3_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) { - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, NUM_CHANNELS); - - const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); - - // Load the values from the input tensor - float3 w0 = vload3(0, (__global float *)(src_addr + 0 * src_stride_y)); - float3 w1 = vload3(0, (__global float *)(src_addr + 1 * src_stride_y)); - float3 w2 = vload3(0, (__global float *)(src_addr + 2 * src_stride_y)); - - // Transform the 3x3 tile in a 4x4 tile - float4 out0 = 0.0f; - float4 out1 = 0.0f; - float4 out2 = 0.0f; - float4 out3 = 0.0f; - - // Row 0 - out0.s0 = (w0.s0); - out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f; - out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f; - out0.s3 = (w0.s2); - - // Row 1 - out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f; - out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f; - out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f; - out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f; - - // Row 2 - out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f; - out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f; - out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f; - out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f; - - // Row 3 - out3.s0 = (w2.s0); - out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f; - out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f; - out3.s3 = (w2.s2); - - int z = get_global_id(2); - int x0 = z / NUM_CHANNELS; // idx filter - int y0 = z % NUM_CHANNELS; // idx channel + // Each thread stores a 2x2 tile + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + + const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); + + // Load the values across the 16 channels to compose the 4x4 tile + float d00 = *((__global float *)(src_addr + 0 * src_stride_z)); + float d01 = *((__global float *)(src_addr + 1 * src_stride_z)); + float d02 = *((__global float *)(src_addr + 2 * src_stride_z)); + float d03 = *((__global float *)(src_addr + 3 * src_stride_z)); + + float d10 = *((__global float *)(src_addr + 4 * src_stride_z)); + float d11 = *((__global float *)(src_addr + 5 * src_stride_z)); + float d12 = *((__global float *)(src_addr + 6 * src_stride_z)); + float d13 = *((__global float *)(src_addr + 7 * src_stride_z)); + + float d20 = *((__global float *)(src_addr + 8 * src_stride_z)); + float d21 = *((__global float *)(src_addr + 9 * src_stride_z)); + float d22 = *((__global float *)(src_addr + 10 * src_stride_z)); + float d23 = *((__global float *)(src_addr + 11 * src_stride_z)); + + float d30 = *((__global float *)(src_addr + 12 * src_stride_z)); + float d31 = *((__global float *)(src_addr + 13 * src_stride_z)); + float d32 = *((__global float *)(src_addr + 14 * src_stride_z)); + float d33 = *((__global float *)(src_addr + 15 * src_stride_z)); + + // Compute the 2x2 output tile + float k0 = d01 + d11 + d21; + float k1 = d02 + d12 + d22; + float k2 = d11 - d21 - d31; + float k3 = d12 - d22 - d32; + + // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22 + // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23) + // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32) + // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33) + + float out00 = d10; + float out01 = -d13; + float out10 = d10; + float out11 = -d13; + + out00 += d00 + d20 + k0 + k1; + out01 += k0 - k1 - (d03 + d23); + out10 += -d20 - d30 + k2 + k3; + out11 += k2 - k3 + d23 + d33; + + int y_in = get_global_id(1); + int x_out = (y_in % NUM_TILES_X) * 2; + int y_out = (y_in / NUM_TILES_X) * 2; + int z_out = get_global_id(0); + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global float *)(vector_offset(&bias, z_out))); + + out00 += (float)b; + out01 += (float)b; + out10 += (float)b; + out11 += (float)b; +#endif // defined(HAS_BIAS) // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * dst_stride_x + y_out * dst_stride_y + z_out * dst_stride_z; - // Store the 16 values across the 16 channels - *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global float *)(dst_addr + 4 * dst_stride_z) = out1.s0; - *(__global float *)(dst_addr + 5 * dst_stride_z) = out1.s1; - *(__global float *)(dst_addr + 6 * dst_stride_z) = out1.s2; - *(__global float *)(dst_addr + 7 * dst_stride_z) = out1.s3; - *(__global float *)(dst_addr + 8 * dst_stride_z) = out2.s0; - *(__global float *)(dst_addr + 9 * dst_stride_z) = out2.s1; - *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2; - *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3; - *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0; - *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1; - *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2; - *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3; + // Store the 2x2 output tile + vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y)); + vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y)); } -#endif // defined(NUM_CHANNELS) +#endif // defined(NUM_TILES_X) diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 9c69800928..7b785bb8da 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -55,6 +55,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3"); if(!is_interleaved_transposed) { @@ -174,7 +175,7 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu } // namespace CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr) + : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) { } @@ -192,9 +193,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info)); - _input0 = input0; - _input1 = input1; - _output = output; + _input0 = input0; + _input1 = input1; + _output = output; + _slide_matrix_b = _input1->info()->num_dimensions() >= _input0->info()->num_dimensions(); const DataType data_type = input0->info()->data_type(); const int fp_pos = input0->info()->fixed_point_position(); @@ -257,6 +259,9 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen "-DALPHA=" + float_to_string_with_full_precision(alpha)); } + // Do not slide matrix B if _slide_matrix_b = false + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); + std::string kernel_name; if(is_interleaved_transposed) { @@ -365,7 +370,7 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(_input1->info()->num_dimensions() < 3) + if(!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -374,9 +379,9 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que add_2D_tensor_argument(idx, _input0, slice); add_2D_tensor_argument(idx, _input1, slice_b); add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[3])); - _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[3])); + _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); + _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, _lws_hint); } while(window.slide_window_slice_3D(slice)); diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp index 5489fde818..f69a39e4ad 100644 --- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -76,15 +76,18 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - window_changed = window_changed || update_window_and_padding(win, input_access); // Configure window in case of configured output if(output->total_size() != 0) { AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); - window_changed = window_changed || update_window_and_padding(win, output_access); + window_changed = window_changed || update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape())); } + else + { + window_changed = window_changed || update_window_and_padding(win, input_access); + } // Collapse along the Z direction Window collapsed = win.collapse(win, Window::DimZ); diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp index 3dbbe157b2..655b82bf66 100644 --- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp @@ -76,7 +76,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1)); window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, input->valid_region()); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); Window win_collapsed = win.collapse(win, Window::DimZ); diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp index 72adb5f358..3b9350f9ba 100644 --- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp @@ -44,11 +44,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_dims.width != 3 || kernel_dims.height != 3, "Winograd input transform only supports 3x3 kernels"); ARM_COMPUTE_UNUSED(kernel_dims); - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, Size2D(3U, 3U)); - // Validate configured output if(output->total_size() != 0) { + const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, kernel_dims); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -151,7 +151,8 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &conv_info, const Size2D &kernel_dims) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON(validate_arguments(input, output, conv_info, kernel_dims)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, conv_info, kernel_dims)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), conv_info, kernel_dims).first); return Status{}; } diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp new file mode 100644 index 0000000000..c9823275eb --- /dev/null +++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, const Size2D &num_tiles) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_dims.width != 3 || kernel_dims.height != 3, "Only 3x3 kernels are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast(std::sqrt(input->dimension(2))) != 4, "Only 2x2 output tile is supported"); + ARM_COMPUTE_UNUSED(kernel_dims); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + } + + // Checks performed when output is configured + if(output->total_size() != 0) + { + const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, output_convolved_dims, DataLayout::NCHW)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + bool window_changed = false; + + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), 2), ceil_to_multiple(output->dimension(1), 2)); + + if(bias != nullptr) + { + AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1)); + window_changed = update_window_and_padding(win, input_access, bias_access, output_access); + } + else + { + window_changed = update_window_and_padding(win, input_access, output_access); + } + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +CLWinogradOutputTransformKernel::CLWinogradOutputTransformKernel() + : _input(nullptr), _bias(nullptr), _output(nullptr) +{ +} + +void CLWinogradOutputTransformKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, + const Size2D &num_tiles) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(kernel_dims); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input->info(), output_convolved_dims, DataLayout::NCHW))); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), kernel_dims, output_convolved_dims, num_tiles)); + + _input = input; + _bias = bias; + _output = output; + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option_if(_bias != nullptr, std::string("-DHAS_BIAS")); + build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width)); + + // Create kernel + _kernel = static_cast(CLKernelLibrary::get().create_kernel("winograd_output_transform_2x2_3x3_nchw", build_opts.options())); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "winograd_output_transform_2x2_3x3"; + _config_id += lower_string(string_from_data_type(input->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); +} + +Status CLWinogradOutputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const Size2D &kernel_dims, const Size2D &output_convolved_dims, + const Size2D &num_tiles) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, (bias != nullptr ? bias->clone().get() : nullptr), output, kernel_dims, output_convolved_dims, num_tiles)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), output->clone().get()).first); + + return Status{}; +} + +void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup output slice + Window slice_out(slice); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + if(_bias != nullptr) + { + unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); + Window slice_biases; + slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape()); + add_1D_tensor_argument(idx1, _bias, slice_biases); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice_out); + enqueue(queue, *this, slice, _lws_hint); + } + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); +} \ No newline at end of file diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index a06d94c1f5..172facfa78 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -66,17 +66,21 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const ICLT ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); if(c != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(0) != output->dimension(0), "The C matrix must have the same number of rows as the output matrix"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c->info()->dimension(1) != output->dimension(1), "The C matrix must have the same number of columns as the output matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The matrix C must have the same number of rows as the matrix A"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The matrix C must have the same number of columns as the matrix B"); + } + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A"); } ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp new file mode 100644 index 0000000000..5081cbac4e --- /dev/null +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +using namespace arm_compute; + +CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr memory_manager) + : _memory_group(memory_manager), _batched_mm(memory_manager), _input_transform(), _filter_transform(), _output_transform(), _input0(), _input1(), _batched_mm_output(), _is_first_run(true) +{ +} + +void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info) +{ + // TODO(COMPMID-1013): This part will be removed + // Get indeces for the width and height + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + + // Kernel size + const unsigned int kernel_w = weights->info()->tensor_shape()[idx_width]; + const unsigned int kernel_h = weights->info()->tensor_shape()[idx_height]; + + // Number of tiles along the X and Y direction + const unsigned int num_tiles_x = std::ceil((input->info()->tensor_shape().x() - (kernel_w - 1) + conv_info.pad_left() + conv_info.pad_right()) / 2.f); + const unsigned int num_tiles_y = std::ceil((input->info()->tensor_shape().y() - (kernel_h - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / 2.f); + + // Compute output shape + const TensorShape output_convolved_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info); + + // Manage intermediate tensors + _memory_group.manage(&_input0); + _memory_group.manage(&_batched_mm_output); + + // Do not manage _input1 as it contains the weights + + // Configure input transform + _input_transform.configure(input, &_input0, conv_info, Size2D(kernel_w, kernel_h)); + + // Configure filter transform + _filter_transform.configure(weights, &_input1); + + // Configure batched matrix multiply + _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)); + + // Configure output transform + _output_transform.configure(&_batched_mm_output, biases, output, Size2D(kernel_w, kernel_h), Size2D(output_convolved_shape[idx_width], output_convolved_shape[idx_height]), Size2D(num_tiles_x, + num_tiles_y)); + + // Allocate temporary tensors + _input0.allocator()->allocate(); + _input1.allocator()->allocate(); + _batched_mm_output.allocator()->allocate(); +} + +Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info) +{ + // TODO(COMPMID-1013): This part will be removed + // Get indeces for the width and height + const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + + // Kernel size + const unsigned int kernel_w = weights->tensor_shape()[idx_width]; + const unsigned int kernel_h = weights->tensor_shape()[idx_height]; + + // Number of tiles along the X and Y direction + const unsigned int num_tiles_x = std::ceil((input->tensor_shape().x() - (kernel_w - 1) + conv_info.pad_left() + conv_info.pad_right()) / 2.f); + const unsigned int num_tiles_y = std::ceil((input->tensor_shape().y() - (kernel_h - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / 2.f); + + // Compute output shape + const TensorShape output_convolved_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info); + + // Validate input transform + const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, conv_info, Size2D(kernel_w, kernel_h)); + const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape); + ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransform::validate(input, &input0, conv_info, Size2D(kernel_w, kernel_h))); + + // Validate filter transform + const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights); + const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); + ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradFilterTransformKernel::validate(weights, &input1)); + + // Configure batched matrix multiply + TensorShape batched_mm_output_shape = input0.tensor_shape(); + batched_mm_output_shape[0] = input1.tensor_shape()[0]; + const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/))); + + // Configure output transform + ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, Size2D(kernel_w, kernel_h), Size2D(output_convolved_shape[idx_width], + output_convolved_shape[idx_height]), + Size2D(num_tiles_x, num_tiles_y))); + + return Status{}; +} + +void CLWinogradConvolutionLayer::run() +{ + if(_is_first_run) + { + // Run filter transform + CLScheduler::get().enqueue(_filter_transform, false); + + _is_first_run = false; + } + + _memory_group.acquire(); + + // Run input transform + _input_transform.run(); + + // Run batched matrix multiplication + _batched_mm.run(); + + // Run output transform + CLScheduler::get().enqueue(_output_transform); + + _memory_group.release(); +} diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp index 652f31ac74..0499d4cd2f 100644 --- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp +++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp @@ -40,6 +40,6 @@ void CLWinogradInputTransform::configure(ICLTensor *input, ICLTensor *output, co Status CLWinogradInputTransform::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &conv_info, const Size2D &kernel_dims) { - ARM_COMPUTE_RETURN_ERROR_ON(CLWinogradInputTransformKernel::validate(input, output, conv_info, kernel_dims)); + ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradInputTransformKernel::validate(input, output, conv_info, kernel_dims)); return Status{}; } diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h index 086b2e3def..ec8e09fa81 100644 --- a/tests/datasets/LargeConvolutionLayerDataset.h +++ b/tests/datasets/LargeConvolutionLayerDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,6 +37,28 @@ namespace test { namespace datasets { +class LargeWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset +{ +public: + LargeWinogradConvolutionLayer3x3Dataset() + { + // Kernel size 3 + // Batch size 1 + add_config(TensorShape(224U, 222U, 64U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(112U, 113U, 64U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(112U, 113U, 128U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(112U, 112U, 128U), TensorShape(3U, 3U, 128U, 129U), TensorShape(129U), TensorShape(112U, 110U, 129U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(53U, 56U, 125U), TensorShape(3U, 3U, 125U, 256U), TensorShape(256U), TensorShape(51U, 56U, 256U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(56U, 56U, 256U), TensorShape(3U, 3U, 256U, 256U), TensorShape(256U), TensorShape(56U, 54U, 256U), PadStrideInfo(1, 1, 1, 0)); + add_config(TensorShape(28U, 28U, 257U), TensorShape(3U, 3U, 257U, 512U), TensorShape(512U), TensorShape(26U, 28U, 512U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(28U, 28U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(28U, 28U, 512U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(14U, 14U, 512U), TensorShape(3U, 3U, 512U, 512U), TensorShape(512U), TensorShape(12U, 12U, 512U), PadStrideInfo(1, 1, 0, 0)); + // Batch size 3, 2 and 4 + add_config(TensorShape(224U, 222U, 64U, 3U), TensorShape(3U, 3U, 64U, 64U), TensorShape(64U), TensorShape(224U, 222U, 64U, 3U), PadStrideInfo(1, 1, 1, 1)); + add_config(TensorShape(112U, 113U, 64U, 2U), TensorShape(3U, 3U, 64U, 128U), TensorShape(128U), TensorShape(110U, 113U, 128U, 2U), PadStrideInfo(1, 1, 0, 1)); + add_config(TensorShape(111U, 112U, 127U, 4U), TensorShape(3U, 3U, 127U, 128U), TensorShape(128U), TensorShape(111U, 112U, 128U, 4U), PadStrideInfo(1, 1, 1, 1)); + } +}; + class LargeConvolutionLayerDataset final : public ConvolutionLayerDataset { public: diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h index adb61de8e2..696c396eef 100644 --- a/tests/datasets/SmallConvolutionLayerDataset.h +++ b/tests/datasets/SmallConvolutionLayerDataset.h @@ -37,10 +37,10 @@ namespace test { namespace datasets { -class SmallWinogradLayerDataset final : public ConvolutionLayerDataset +class SmallWinogradConvolutionLayer3x3Dataset final : public ConvolutionLayerDataset { public: - SmallWinogradLayerDataset() + SmallWinogradConvolutionLayer3x3Dataset() { // Kernel size 3 // Batch size 1 @@ -48,8 +48,14 @@ public: // Batch size 4 add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(21U, 25U, 21U, 4U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(8U, 8U, 2U), TensorShape(3U, 3U, 2U, 1U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 1, 1)); + } +}; - // Kernel size 5 +class SmallWinogradConvolutionLayer5x5Dataset final : public ConvolutionLayerDataset +{ +public: + SmallWinogradConvolutionLayer5x5Dataset() + { add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U, 1U), TensorShape(1U), TensorShape(4U, 4U, 1U), PadStrideInfo(1, 1, 0, 0)); add_config(TensorShape(8U, 8U, 2U), TensorShape(5U, 5U, 2U), TensorShape(1U), TensorShape(8U, 8U, 1U), PadStrideInfo(1, 1, 2, 2)); } diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h new file mode 100644 index 0000000000..c42d6c8ebd --- /dev/null +++ b/tests/datasets/WinogradOutputTransformDataset.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET +#define ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET + +#include "utils/TypePrinter.h" + +#include "arm_compute/core/TensorShape.h" + +namespace arm_compute +{ +namespace test +{ +namespace datasets +{ +class WinogradOutputTransformDataset +{ +public: + using type = std::tuple; + + struct iterator + { + iterator(std::vector::const_iterator a_it, + std::vector::const_iterator b_it, + std::vector::const_iterator c_it, + std::vector::const_iterator d_it, + std::vector::const_iterator data_layout_it) + : _a_it{ std::move(a_it) }, + _b_it{ std::move(b_it) }, + _c_it{ std::move(c_it) }, + _d_it{ std::move(d_it) }, + _data_layout_it{ std::move(data_layout_it) } + { + } + + std::string description() const + { + std::stringstream description; + description << "Input=" << *_a_it << ":"; + description << "KernelDims=" << *_b_it << ":"; + description << "OutputDims=" << *_c_it << ":"; + description << "NumTiles=" << *_d_it << ":"; + description << "DataLayout=" << *_data_layout_it; + return description.str(); + } + + WinogradOutputTransformDataset::type operator*() const + { + return std::make_tuple(*_a_it, *_b_it, *_c_it, *_d_it, *_data_layout_it); + } + + iterator &operator++() + { + ++_a_it; + ++_b_it; + ++_c_it; + ++_d_it; + ++_data_layout_it; + + return *this; + } + + private: + std::vector::const_iterator _a_it; + std::vector::const_iterator _b_it; + std::vector::const_iterator _c_it; + std::vector::const_iterator _d_it; + std::vector::const_iterator _data_layout_it; + }; + + iterator begin() const + { + return iterator(_a_shapes.begin(), _b_dims.begin(), _c_dims.begin(), _d_dims.begin(), _data_layout.begin()); + } + + int size() const + { + return std::min(_a_shapes.size(), std::min(_b_dims.size(), std::min(_c_dims.size(), std::min(_d_dims.size(), _data_layout.size())))); + } + + void add_config(TensorShape a, Size2D b, Size2D c, Size2D d, DataLayout data_layout) + { + _a_shapes.emplace_back(std::move(a)); + _b_dims.emplace_back(std::move(b)); + _c_dims.emplace_back(std::move(c)); + _d_dims.emplace_back(std::move(d)); + _data_layout.emplace_back(std::move(data_layout)); + } + +protected: + WinogradOutputTransformDataset() = default; + WinogradOutputTransformDataset(WinogradOutputTransformDataset &&) = default; + +private: + std::vector _a_shapes{}; + std::vector _b_dims{}; + std::vector _c_dims{}; + std::vector _d_dims{}; + std::vector _data_layout{}; +}; + +class SmallWinogradOutputTransformDataset final : public WinogradOutputTransformDataset +{ +public: + SmallWinogradOutputTransformDataset() + { + add_config(TensorShape(24U, 49U, 16U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW); + add_config(TensorShape(13U, 6U, 16U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW); + add_config(TensorShape(7U, 20U, 16U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW); + add_config(TensorShape(24U, 49U, 16U, 3U), Size2D(3, 3), Size2D(14U, 14U), Size2D(7U, 7U), DataLayout::NCHW); + add_config(TensorShape(13U, 6U, 16U, 2U), Size2D(3, 3), Size2D(5U, 4U), Size2D(3U, 2U), DataLayout::NCHW); + add_config(TensorShape(7U, 20U, 16U, 5U), Size2D(3, 3), Size2D(8U, 9U), Size2D(4U, 5U), DataLayout::NCHW); + } +}; + +class LargeWinogradOutputTransformDataset final : public WinogradOutputTransformDataset +{ +public: + LargeWinogradOutputTransformDataset() + { + add_config(TensorShape(128U, 3136U, 16U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW); + add_config(TensorShape(256U, 784U, 16U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW); + add_config(TensorShape(512U, 169U, 16U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW); + add_config(TensorShape(128U, 3136U, 16U, 3U), Size2D(3, 3), Size2D(112U, 112U), Size2D(56U, 56U), DataLayout::NCHW); + add_config(TensorShape(256U, 784U, 16U, 2U), Size2D(3, 3), Size2D(55U, 55U), Size2D(28U, 28U), DataLayout::NCHW); + add_config(TensorShape(512U, 169U, 16U, 5U), Size2D(3, 3), Size2D(26U, 26U), Size2D(13U, 13U), DataLayout::NCHW); + } +}; +} // namespace datasets +} // namespace test +} // namespace arm_compute +#endif /* ARM_COMPUTE_TEST_WINOGRAD_OUTPUT_TRANSFORM_DATASET */ diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp index 0b21ed2577..aa668fa575 100644 --- a/tests/validation/CL/Winograd.cpp +++ b/tests/validation/CL/Winograd.cpp @@ -22,17 +22,22 @@ * SOFTWARE. */ #include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h" +#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include "tests/PaddingCalculator.h" +#include "tests/datasets/LargeConvolutionLayerDataset.h" #include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/SmallConvolutionLayerDataset.h" #include "tests/datasets/WinogradFilterTransformDataset.h" #include "tests/datasets/WinogradInputTransformDataset.h" +#include "tests/datasets/WinogradOutputTransformDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" @@ -47,7 +52,7 @@ namespace validation { namespace { -constexpr AbsoluteTolerance tolerance_f32(0.0001f); +constexpr AbsoluteTolerance tolerance_f32(0.001f); } // namespace using namespace arm_compute::misc::shape_calculator; @@ -65,9 +70,9 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::QASYMM8), // QASYMM8 not supported TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32), // Kernel size not supported TensorInfo(TensorShape(53U, 21U, 5U, 3U), 1, DataType::F32), // Strides not supported - TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32), // valid - TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32), // valid - TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32) // valid + TensorInfo(TensorShape(53U, 33U, 4U), 1, DataType::F32), // Padding needed + TensorInfo(TensorShape(34U, 42U, 7U, 3U), 1, DataType::F32), // Padding needed + TensorInfo(TensorShape(31U, 37U, 37U), 1, DataType::F32) // Padding needed }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(5U, 5U, 16U, 3U), 1, DataType::F16), @@ -96,7 +101,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip( Size2D(3U, 3U), Size2D(3U, 3U) })), - framework::dataset::make("Expected", { false, false, false, false, true, true, true })), + framework::dataset::make("Expected", { false, false, false, false, false, false, false })), input_info, output_info, conv_info, kernel_dims, expected) { ARM_COMPUTE_EXPECT(bool(CLWinogradInputTransform::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, kernel_dims)) == expected, framework::LogLevel::ERRORS); @@ -203,8 +208,172 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradFilterTransformFixture, framework::Da // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } + TEST_SUITE_END() // FilterTransform +TEST_SUITE(OutputTransform) +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputInfo",{ + TensorInfo(TensorShape(24U, 49U, 16U, 5U), 1, DataType::F16), // F16 not supported + TensorInfo(TensorShape(128U, 3136U, 16U, 5U), 1, DataType::QASYMM8), // QASYMM8 not supported + TensorInfo(TensorShape(256U, 784U, 16U, 5U), 1, DataType::F32), // Kernel size not supported + TensorInfo(TensorShape(512U, 169U, 16U, 5U), 1, DataType::F32), // Valid + TensorInfo(TensorShape(13U, 6U, 16U, 4U), 1, DataType::F32), // Padding needed + TensorInfo(TensorShape(7U, 16U, 16U, 7U), 1, DataType::F32), // Valid + TensorInfo(TensorShape(1U, 442U, 16U, 37U), 1, DataType::F32) // Wrong number of tiles + }), + framework::dataset::make("BiasInfo", { + TensorInfo(TensorShape(24U), 1, DataType::F16), + TensorInfo(TensorShape(128U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(256U), 1, DataType::F32), + TensorInfo(TensorShape(512U), 1, DataType::F32), + TensorInfo(TensorShape(13U), 1, DataType::F32), + TensorInfo(TensorShape(7U), 1, DataType::F32), + TensorInfo(TensorShape(1U), 1, DataType::F32) + })), + framework::dataset::make("OutputInfo", { + TensorInfo(TensorShape(14U, 14U, 24U, 5U), 1, DataType::F16), + TensorInfo(TensorShape(112U, 112U, 128U, 5U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(55U, 55U, 256U, 5U), 1, DataType::F32), + TensorInfo(TensorShape(26U, 26U, 512U, 5U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 4U, 13U, 4U), 1, DataType::F32), + TensorInfo(TensorShape(8U, 8U, 7U, 7U), 1, DataType::F32), + TensorInfo(TensorShape(51U, 33U, 1U, 37U), 1, DataType::F32) + })), + framework::dataset::make("KernelDims", { + Size2D(3U, 3U), + Size2D(3U, 3U), + Size2D(5U, 5U), + Size2D(3U, 3U), + Size2D(3U, 3U), + Size2D(3U, 3U), + Size2D(3U, 3U) + })), + framework::dataset::make("OutputDims", { + Size2D(14U, 14U), + Size2D(112U, 112U), + Size2D(55U, 55U), + Size2D(26U, 26U), + Size2D(5U, 4U), + Size2D(8U, 8U), + Size2D(51U, 33U) + })), + framework::dataset::make("NumTiles", { + Size2D(7U, 7U), + Size2D(56U, 56U), + Size2D(28U, 28U), + Size2D(13U, 13U), + Size2D(3U, 2U), + Size2D(4U, 4U), + Size2D(26U, 16U) + })), + framework::dataset::make("Expected", { false, false, false, true, false, true, false })), + input_info, bias_info, output_info, kernel_dims, output_dims, num_tiles, expected) +{ + ARM_COMPUTE_EXPECT(bool(CLWinogradOutputTransformKernel::validate(&input_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), kernel_dims, output_dims, num_tiles)) == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +using CLWinogradOutputTransform = CLSynthetizeFunctionWithZeroConstantBorder; +using CLWinogradOutputTransformFixture = WinogradOutputTransformValidationFixture; + +DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallWinogradOutputTransformDataset(), datasets::LargeWinogradOutputTransformDataset()), + framework::dataset::make("DataType", { DataType::F32 })), + shape_a, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type) +{ + TensorShape shape_b = compute_winograd_output_transform_shape(TensorInfo(shape_a, 1, data_type), output_convolved_dims, data_layout); + + // Create tensors + CLTensor a = create_tensor(shape_a, data_type); + CLTensor b = create_tensor(shape_b, data_type); + + ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Create and configure function + CLWinogradOutputTransform winograd_output_transform; + winograd_output_transform.configure(&a, nullptr, &b, kernel_dims, output_convolved_dims, num_tiles); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradOutputTransformFixture, framework::DatasetMode::ALL, combine(datasets::SmallWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradOutputTransformFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradOutputTransformDataset(), framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +TEST_SUITE_END() // OutputTransform + +TEST_SUITE(ConvolutionLayer) +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( + framework::dataset::make("InputInfo", { + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F16), // FP16 not supported + TensorInfo(TensorShape(17U, 31U, 2U), 1, DataType::F32), // Datatype mismatch + TensorInfo(TensorShape(23U, 27U, 5U, 4U), 1, DataType::F32), // Stride y not supported + TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32), // Padding needed + TensorInfo(TensorShape(33U, 27U, 7U, 4U), 1, DataType::F32) // Kernel size not supported + }), + framework::dataset::make("WeightsInfo", { + TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 2U, 19U), 1, DataType::QASYMM8), + TensorInfo(TensorShape(3U, 3U, 5U, 21U), 1, DataType::F32), + TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(5U, 5U, 7U, 16U), 1, DataType::F16) + })), + framework::dataset::make("BiasesInfo", { + TensorInfo(TensorShape(19U), 1, DataType::F32), + TensorInfo(TensorShape(19U), 1, DataType::F32), + TensorInfo(TensorShape(21U), 1, DataType::F32), + TensorInfo(TensorShape(16U), 1, DataType::F32), + TensorInfo(TensorShape(16U), 1, DataType::F32) + })), + framework::dataset::make("OutputInfo", { + TensorInfo(TensorShape(17U, 31U, 19U), 1, DataType::F32), + TensorInfo(TensorShape(15U, 15U, 19U), 1, DataType::F32), + TensorInfo(TensorShape(21U, 25U, 21U, 4U), 1, DataType::F32), + TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32), + TensorInfo(TensorShape(11U, 12U, 16U, 4U), 1, DataType::F32) + })), + framework::dataset::make("ConvInfo", { + PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 2, 0, 0), + PadStrideInfo(1, 1, 1, 1), + PadStrideInfo(1, 1, 1, 0) + })), + framework::dataset::make("Expected", { false, false, false, false, false })), + input_info, weights_info, bias_info, output_info, conv_info, expected) +{ + ARM_COMPUTE_EXPECT(bool(CLWinogradConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &bias_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info)) == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +using CLWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture; +FIXTURE_DATA_TEST_CASE(RunSmall, CLWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallWinogradConvolutionLayer3x3Dataset(), + framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLWinogradConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeWinogradConvolutionLayer3x3Dataset(), framework::dataset::make("DataType", { DataType::F32 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // ConvolutionLayer + TEST_SUITE_END() // Winograd TEST_SUITE_END() // CL } // namespace validation diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 59db279ac7..34306b381c 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -109,10 +109,12 @@ TEST_SUITE_END() TEST_SUITE(WinogradLayer) template -using NEWinogradLayerFixture = WinogradLayerValidationFixture; +using NEWinogradConvolutionLayerFixture = WinogradConvolutionLayerValidationFixture; TEST_SUITE(FP32) -FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradLayerFixture, framework::DatasetMode::PRECOMMIT, datasets::SmallWinogradLayerDataset()) +FIXTURE_DATA_TEST_CASE(RunSmall, NEWinogradConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(framework::dataset::concat(datasets::SmallWinogradConvolutionLayer3x3Dataset(), + datasets::SmallWinogradConvolutionLayer5x5Dataset()), + framework::dataset::make("DataType", { DataType::F32 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); diff --git a/tests/validation/fixtures/WinogradLayerFixture.h b/tests/validation/fixtures/WinogradLayerFixture.h index bfe1efce3b..9811c28008 100644 --- a/tests/validation/fixtures/WinogradLayerFixture.h +++ b/tests/validation/fixtures/WinogradLayerFixture.h @@ -48,14 +48,14 @@ namespace validation using namespace arm_compute::misc::shape_calculator; template -class WinogradLayerValidationFixture : public framework::Fixture +class WinogradConvolutionLayerValidationFixture : public framework::Fixture { public: template - void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info) + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, DataType data_type) { - _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info); - _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info); + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, data_type); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, data_type); } protected: @@ -79,13 +79,14 @@ protected: } } - TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info) + TensorType compute_target(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type) { // Create tensors - TensorType src = create_tensor(input_shape, DataType::F32, 1); - TensorType weights = create_tensor(weights_shape, DataType::F32, 1); - TensorType bias = create_tensor(bias_shape, DataType::F32, 1); - TensorType dst = create_tensor(output_shape, DataType::F32, 1); + TensorType src = create_tensor(input_shape, data_type, 1); + TensorType weights = create_tensor(weights_shape, data_type, 1); + TensorType bias = create_tensor(bias_shape, data_type, 1); + TensorType dst = create_tensor(output_shape, data_type, 1); // Create and configure function FunctionType conv; @@ -111,20 +112,20 @@ protected: fill(AccessorType(src), 0, -1.f, 1.f); fill(AccessorType(weights), 1, -1.f, 1.f); fill(AccessorType(bias), 2, -1.f, 1.f); - fill(AccessorType(dst), 3, -1.f, 1.f); - // Compute NEWinogradLayer function + // Compute Winograd Convolution function conv.run(); return dst; } - SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info) + SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, const PadStrideInfo &info, + DataType data_type) { // Create reference - SimpleTensor src{ input_shape, DataType::F32, 1 }; - SimpleTensor weights{ weights_shape, DataType::F32, 1 }; - SimpleTensor bias{ bias_shape, DataType::F32, 1 }; + SimpleTensor src{ input_shape, data_type, 1 }; + SimpleTensor weights{ weights_shape, data_type, 1 }; + SimpleTensor bias{ bias_shape, data_type, 1 }; // Fill reference fill(src, 0, -1.f, 1.f); @@ -136,8 +137,6 @@ protected: TensorType _target{}; SimpleTensor _reference{}; - int _fractional_bits{}; - DataType _data_type{}; }; template @@ -178,7 +177,6 @@ protected: { ARM_COMPUTE_UNUSED(is_nchw_format); - // Create tensors TensorType src = create_tensor(input_shape, data_type); TensorType dst = create_tensor(output_shape, data_type); @@ -261,8 +259,8 @@ protected: ARM_COMPUTE_UNUSED(is_nchw_format); // Create tensors - TensorType src = create_tensor(input_shape, data_type); - TensorType dst = create_tensor(output_shape, data_type); + TensorType src = create_tensor(input_shape, data_type, 1); + TensorType dst = create_tensor(output_shape, data_type, 1); // Create and configure function FunctionType filter_transform; @@ -288,7 +286,7 @@ protected: SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, bool is_nchw_format, DataType data_type) { - ARM_COMPUTE_ERROR_ON(!is_nchw_format); + ARM_COMPUTE_UNUSED(is_nchw_format); // Create reference SimpleTensor src{ input_shape, data_type, 1 }; @@ -302,6 +300,86 @@ protected: TensorType _target{}; SimpleTensor _reference{}; }; + +template +class WinogradOutputTransformValidationFixture : public framework::Fixture +{ +public: + template + void setup(TensorShape input_shape, Size2D kernel_dims, Size2D output_convolved_dims, Size2D num_tiles, DataLayout data_layout, DataType data_type) + { + TensorShape output_shape = compute_winograd_output_transform_shape(TensorInfo(input_shape, 1, data_type), output_convolved_dims, data_layout); + + _target = compute_target(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type); + _reference = compute_reference(input_shape, output_shape, kernel_dims, output_convolved_dims, num_tiles, data_layout, data_type); + } + +protected: + template + void fill(U &&tensor, int i, float min, float max) + { + switch(tensor.data_type()) + { + case DataType::F32: + { + std::uniform_real_distribution<> distribution(min, max); + library->fill(tensor, distribution, i); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + library->fill_tensor_uniform(tensor, i); + break; + } + } + } + + TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles, DataLayout data_layout, + DataType data_type) + { + // Create tensors + TensorType src = create_tensor(input_shape, data_type, 1, 0, QuantizationInfo(), data_layout); + TensorType dst = create_tensor(output_shape, data_type, 1, 0, QuantizationInfo(), data_layout); + + // Create and configure function + FunctionType output_transform; + output_transform.configure(&src, nullptr, &dst, kernel_dims, output_convolved_dims, num_tiles); + + ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Allocate tensors + src.allocator()->allocate(); + dst.allocator()->allocate(); + + ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS); + + // Fill tensors + fill(AccessorType(src), 0, -1.f, 1.f); + + output_transform.run(); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &output_convolved_dims, Size2D &num_tiles, + DataLayout data_layout, + DataType data_type) + { + // Create reference + SimpleTensor src{ input_shape, data_type, 1, 0, QuantizationInfo(), data_layout }; + + // Fill reference + fill(src, 0, -1.f, 1.f); + + return reference::winograd_output_transform(src, output_shape, kernel_dims, num_tiles); + } + + TensorType _target{}; + SimpleTensor _reference{}; +}; } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp index 24bbf32a30..f3db274935 100644 --- a/tests/validation/reference/ConvolutionLayer.cpp +++ b/tests/validation/reference/ConvolutionLayer.cpp @@ -118,4 +118,4 @@ template SimpleTensor convolution_layer(const SimpleTensor &sr } // namespace reference } // namespace validation } // namespace test -} // namespace arm_compute +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp index 3ed55fb9fc..c760663b22 100644 --- a/tests/validation/reference/Winograd.cpp +++ b/tests/validation/reference/Winograd.cpp @@ -38,6 +38,87 @@ namespace reference { namespace { +template +void winograd_filter_transform3x3(const SimpleTensor &in, SimpleTensor &out) +{ + // Simple tensor for the 3x3 input tile + SimpleTensor input_tile{ TensorShape(3u, 3u), in.data_type(), 1 }; + + // Simple tensor for the transformation matrix + SimpleTensor trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 }; + + // Simple tensor for the transformation matrix transpose + SimpleTensor trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 }; + + // Simple tensor for the 4x3 temporary tile + SimpleTensor tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 }; + + // Simple tensor for the 4x4 output tile + SimpleTensor output_tile{ TensorShape(4u, 4u), in.data_type(), 1 }; + + // Initialize transformation matrix + // 1 | 0 | 0 + // 0.5 | 0.5 | 0.5 + // 0.5 |-0.5 | 0.5 + // 0 | 0 | 1 + trans_matrix[0 + 0 * 3] = 1.0f; + trans_matrix[1 + 0 * 3] = 0.0f; + trans_matrix[2 + 0 * 3] = 0.0f; + trans_matrix[0 + 1 * 3] = 0.5f; + trans_matrix[1 + 1 * 3] = 0.5f; + trans_matrix[2 + 1 * 3] = 0.5f; + trans_matrix[0 + 2 * 3] = 0.5f; + trans_matrix[1 + 2 * 3] = -0.5f; + trans_matrix[2 + 2 * 3] = 0.5f; + trans_matrix[0 + 3 * 3] = 0.0f; + trans_matrix[1 + 3 * 3] = 0.0f; + trans_matrix[2 + 3 * 3] = 1.0f; + + // Transpose the transformation matrix + transpose_matrix(trans_matrix, trans_matrix_transposed); + + const int num_channels = in.shape()[2]; + const int num_filters = in.shape()[3]; + const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters); + + for(int n = 0; n < num_batches; ++n) + { + for(int w = 0; w < num_filters; ++w) + { + for(int z = 0; z < num_channels; ++z) + { + // Load the 3x3 tile from the input tensor + get_tile(in, input_tile, Coordinates(0, 0, z, w, n)); + + // First transformation + matrix_multiply(trans_matrix, input_tile, tmp_tile); + + // Second transformation + matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile); + + // Store the 4x4 output tile across the 16 channels + const int output_offset = w + z * num_filters; + out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4]; + out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4]; + out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4]; + out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4]; + out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4]; + out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4]; + out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4]; + out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4]; + out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4]; + out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4]; + out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4]; + out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4]; + out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4]; + out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4]; + out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4]; + out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4]; + } + } + } +} + template void winograd_input_transform3x3(const SimpleTensor &src, SimpleTensor &dst, const PadStrideInfo &conv_info) { @@ -112,56 +193,70 @@ void winograd_input_transform3x3(const SimpleTensor &src, SimpleTensor &ds } template -void winograd_filter_transform3x3(const SimpleTensor &in, SimpleTensor &out) +void winograd_output_transform3x3(const SimpleTensor &in, SimpleTensor &out, int num_tiles_x) { + ARM_COMPUTE_ERROR_ON(in.shape()[2] != 16); + ARM_COMPUTE_ERROR_ON(in.shape()[0] != out.shape()[2]); + // Simple tensor for the 3x3 input tile - SimpleTensor input_tile{ TensorShape(3u, 3u), in.data_type(), 1 }; + SimpleTensor input_tile{ TensorShape(4u, 4u), in.data_type(), 1 }; // Simple tensor for the transformation matrix - SimpleTensor trans_matrix{ TensorShape(3u, 4u), in.data_type(), 1 }; + SimpleTensor trans_matrix{ TensorShape(4u, 2u), in.data_type(), 1 }; // Simple tensor for the transformation matrix transpose - SimpleTensor trans_matrix_transposed{ TensorShape(4u, 3u), in.data_type(), 1 }; + SimpleTensor trans_matrix_transposed{ TensorShape(2u, 4u), in.data_type(), 1 }; // Simple tensor for the 4x3 temporary tile - SimpleTensor tmp_tile{ TensorShape(3u, 4u), in.data_type(), 1 }; + SimpleTensor tmp_tile{ TensorShape(4u, 2u), in.data_type(), 1 }; // Simple tensor for the 4x4 output tile - SimpleTensor output_tile{ TensorShape(4u, 4u), in.data_type(), 1 }; + SimpleTensor output_tile{ TensorShape(2u, 2u), in.data_type(), 1 }; // Initialize transformation matrix - // 1 | 0 | 0 - // 0.5 | 0.5 | 0.5 - // 0.5 |-0.5 | 0.5 - // 0 | 0 | 1 - trans_matrix[0 + 0 * 3] = 1.0f; - trans_matrix[1 + 0 * 3] = 0.0f; - trans_matrix[2 + 0 * 3] = 0.0f; - trans_matrix[0 + 1 * 3] = 0.5f; - trans_matrix[1 + 1 * 3] = 0.5f; - trans_matrix[2 + 1 * 3] = 0.5f; - trans_matrix[0 + 2 * 3] = 0.5f; - trans_matrix[1 + 2 * 3] = -0.5f; - trans_matrix[2 + 2 * 3] = 0.5f; - trans_matrix[0 + 3 * 3] = 0.0f; - trans_matrix[1 + 3 * 3] = 0.0f; - trans_matrix[2 + 3 * 3] = 1.0f; + // 1 | 1 | 1 | 1 + // 0 | 1 | -1 | -1 + trans_matrix[0 + 0 * 4] = 1.0f; + trans_matrix[1 + 0 * 4] = 1.0f; + trans_matrix[2 + 0 * 4] = 1.0f; + trans_matrix[3 + 0 * 4] = 0.0f; + trans_matrix[0 + 1 * 4] = 0.0f; + trans_matrix[1 + 1 * 4] = 1.0f; + trans_matrix[2 + 1 * 4] = -1.0f; + trans_matrix[3 + 1 * 4] = -1.0f; // Transpose the transformation matrix transpose_matrix(trans_matrix, trans_matrix_transposed); - const int num_channels = in.shape()[2]; - const int num_filters = in.shape()[3]; - const int num_batches = in.shape().total_size() / (9 * num_channels * num_filters); + const int w_in = in.shape()[0]; + const int h_in = in.shape()[1]; + const int c_in = in.shape()[2]; + const int w_out = out.shape()[0]; + const int h_out = out.shape()[1]; + const int c_out = out.shape()[2]; + const int num_batches = in.shape().total_size() / (w_in * h_in * c_in); + + // Input strides + const int stridey_in = w_in; + const int stridez_in = stridey_in * h_in; + const int stridew_in = stridez_in * c_in; + + // Output strides + const int stridey_out = w_out; + const int stridez_out = stridey_out * h_out; + const int stridew_out = stridez_out * c_out; for(int n = 0; n < num_batches; ++n) { - for(int w = 0; w < num_filters; ++w) + for(int y = 0; y < h_in; ++y) { - for(int z = 0; z < num_channels; ++z) + for(int x = 0; x < w_in; ++x) { - // Load the 3x3 tile from the input tensor - get_tile(in, input_tile, Coordinates(0, 0, z, w, n)); + // Load the 4x4 tile across the 16 channels of the input tensor + for(int z = 0; z < c_in; ++z) + { + input_tile[z] = in[x + (y * stridey_in) + (z * stridez_in) + (n * stridew_in)]; + } // First transformation matrix_multiply(trans_matrix, input_tile, tmp_tile); @@ -169,24 +264,29 @@ void winograd_filter_transform3x3(const SimpleTensor &in, SimpleTensor &ou // Second transformation matrix_multiply(tmp_tile, trans_matrix_transposed, output_tile); - // Store the 4x4 output tile across the 16 channels - const int output_offset = w + z * num_filters; - out[output_offset + 0 * num_filters * num_channels] = output_tile[0 + 0 * 4]; - out[output_offset + 1 * num_filters * num_channels] = output_tile[1 + 0 * 4]; - out[output_offset + 2 * num_filters * num_channels] = output_tile[2 + 0 * 4]; - out[output_offset + 3 * num_filters * num_channels] = output_tile[3 + 0 * 4]; - out[output_offset + 4 * num_filters * num_channels] = output_tile[0 + 1 * 4]; - out[output_offset + 5 * num_filters * num_channels] = output_tile[1 + 1 * 4]; - out[output_offset + 6 * num_filters * num_channels] = output_tile[2 + 1 * 4]; - out[output_offset + 7 * num_filters * num_channels] = output_tile[3 + 1 * 4]; - out[output_offset + 8 * num_filters * num_channels] = output_tile[0 + 2 * 4]; - out[output_offset + 9 * num_filters * num_channels] = output_tile[1 + 2 * 4]; - out[output_offset + 10 * num_filters * num_channels] = output_tile[2 + 2 * 4]; - out[output_offset + 11 * num_filters * num_channels] = output_tile[3 + 2 * 4]; - out[output_offset + 12 * num_filters * num_channels] = output_tile[0 + 3 * 4]; - out[output_offset + 13 * num_filters * num_channels] = output_tile[1 + 3 * 4]; - out[output_offset + 14 * num_filters * num_channels] = output_tile[2 + 3 * 4]; - out[output_offset + 15 * num_filters * num_channels] = output_tile[3 + 3 * 4]; + // Store the 2x2 output tile + const int xo = (y % num_tiles_x) * 2; + const int yo = (y / num_tiles_x) * 2; + const int zo = x; + + const int output_offset = xo + (yo * stridey_out) + (zo * stridez_out) + (n * stridew_out); + out[output_offset + 0 * stridey_out + 0] = output_tile[0 + 0 * 2]; + + // Check out-of-bound writes + if(xo + 1 < w_out) + { + out[output_offset + 0 * stridey_out + 1] = output_tile[1 + 0 * 2]; + } + + if(yo + 1 < h_out) + { + out[output_offset + 1 * stridey_out + 0] = output_tile[0 + 1 * 2]; + } + + if((yo + 1 < h_out) && (xo + 1 < w_out)) + { + out[output_offset + 1 * stridey_out + 1] = output_tile[1 + 1 * 2]; + } } } } @@ -234,8 +334,32 @@ SimpleTensor winograd_filter_transform(const SimpleTensor &in, const Tenso return out; } +template +SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles) +{ + ARM_COMPUTE_ERROR_ON_MSG(in.data_layout() != DataLayout::NCHW, "Only supported NCHW data format"); + ARM_COMPUTE_ERROR_ON(kernel_dims.width != kernel_dims.height); + ARM_COMPUTE_ERROR_ON(in.shape()[1] != num_tiles.area()); + + // Create reference + SimpleTensor out{ output_shape, in.data_type(), 1 }; + + switch(kernel_dims.width) + { + case 3: + winograd_output_transform3x3(in, out, num_tiles.width); + break; + default: + ARM_COMPUTE_ERROR("Only supported 3x3 kernel"); + break; + } + + return out; +} + template SimpleTensor winograd_input_transform(const SimpleTensor &src, const TensorShape &dst_shape, const PadStrideInfo &conv_info, const Size2D &kernel_dims); template SimpleTensor winograd_filter_transform(const SimpleTensor &in, const TensorShape &output_shape); +template SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h index ba8e5c1cb6..fa1a7f3f61 100644 --- a/tests/validation/reference/Winograd.h +++ b/tests/validation/reference/Winograd.h @@ -41,6 +41,9 @@ SimpleTensor winograd_input_transform(const SimpleTensor &src, const Tenso template SimpleTensor winograd_filter_transform(const SimpleTensor &in, const TensorShape &output_shape); + +template +SimpleTensor winograd_output_transform(const SimpleTensor &in, const TensorShape &output_shape, const Size2D &kernel_dims, const Size2D &num_tiles); } // namespace reference } // namespace validation } // namespace test -- cgit v1.2.1