From 5cb4c42cb5d781a44409ebc97a408e1379ce182d Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Fri, 23 Jun 2017 10:38:25 +0100
Subject: COMPMID-414 - Port CLConvolutionLayer to support 8 bit fixed point

- CLWeightsReshapeKernel

Change-Id: Ie32e6bdd557a8243eb9988aa7eab4e4ca2291e79
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78701
Tested-by: Kaizen
Reviewed-by: Moritz Pflanzer
---
 .../core/CL/kernels/CLWeightsReshapeKernel.h       | 58 ++-------------
 .../runtime/CL/functions/CLConvolutionLayer.h      | 14 ++--
 .../runtime/CL/functions/CLLocallyConnectedLayer.h | 18 ++---
 docs/00_introduction.dox                           |  2 +-
 src/core/CL/CLHelpers.cpp                          |  2 +
 src/core/CL/kernels/CLWeightsReshapeKernel.cpp     | 86 +++++++---------------
 src/core/NEON/kernels/NEWeightsReshapeKernel.cpp   | 23 ++----
 7 files changed, 60 insertions(+), 143 deletions(-)
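For context: QS8 is the library's 8-bit signed fixed-point format, where a per-tensor fixed_point_position selects how many of the 8 bits are fractional, so a raw value r represents r / 2^fixed_point_position. A minimal standalone sketch of that encoding (the helper names and the fixed-point position of 5 are illustrative assumptions, not library code):

    #include <cmath>
    #include <cstdint>
    #include <iostream>

    // Encode a real value as QS8 (8-bit signed fixed point).
    // fixed_point_position is the number of fractional bits.
    int8_t to_qs8(float value, int fixed_point_position)
    {
        const float scale   = static_cast<float>(1 << fixed_point_position);
        const float clamped = std::fmax(-128.0f, std::fmin(127.0f, std::round(value * scale)));
        return static_cast<int8_t>(clamped);
    }

    // Decode a QS8 raw value back to a real number.
    float from_qs8(int8_t raw, int fixed_point_position)
    {
        return static_cast<float>(raw) / static_cast<float>(1 << fixed_point_position);
    }

    int main()
    {
        const int    fp = 5;                // hypothetical: 5 fractional bits
        const int8_t q  = to_qs8(1.25f, fp); // 1.25 * 32 = 40
        std::cout << static_cast<int>(q) << " -> " << from_qs8(q, fp) << "\n"; // 40 -> 1.25
        return 0;
    }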
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index 1dc8a8b80e..0d00f0e00e 100644
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -31,11 +31,8 @@ namespace arm_compute
 class CLWeightsReshapeKernel : public ICLKernel
 {
 public:
-    /** Constructor.
-     *
-     * @param[in] is_shared Flag to indicate whether the weights are shared or not.
-     */
-    CLWeightsReshapeKernel(bool is_shared = false);
+    /** Constructor. */
+    CLWeightsReshapeKernel();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -50,7 +47,7 @@ public:
     /** Set the input and output of the kernel.
      *
      * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: F16, F32
+     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/F16/F32
      * @param[in]  biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
@@ -58,57 +55,12 @@ public:
      */
     void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output);
 
     // Inherited methods overridden:
-    virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+    void run(const Window &window, cl::CommandQueue &queue) override;
 
-protected:
-    bool             _is_shared;
+private:
     const ICLTensor *_input;
     const ICLTensor *_biases;
     ICLTensor       *_output;
 };
-
-/** Interface for the weights reshape kernel used by convolution and fully connected layers.
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref CLIm2ColKernel can transform a convolution into a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class CLConvolutionLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
-{
-public:
-    /** Default constructor */
-    CLConvolutionLayerWeightsReshapeKernel();
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface for the weights reshape kernel used by locally connected layers. */
-class CLLocallyConnectedLayerWeightsReshapeKernel : public CLWeightsReshapeKernel
-{
-public:
-    /** Default constructor */
-    CLLocallyConnectedLayerWeightsReshapeKernel();
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
 }
 #endif /*__ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H__ */
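The reshape that both layers now share flattens each 3D kernel [kernel_x, kernel_y, IFM] into a single vector and appends that kernel's bias when one is present; per the shape arithmetic in the .cpp file below, the OpenCL kernel effectively writes one linearized kernel per output-matrix column, giving a [OFM, kernel_x * kernel_y * IFM + 1] result. A host-side sketch of the same rearrangement, transposed to one kernel per row for readability (illustrative only, not the kernel's code):

    #include <cstddef>
    #include <vector>

    // Simplified shared-weights case: weights laid out as
    // [kernel_x, kernel_y, IFM, OFM] (x fastest) become one linearized
    // kernel per row, with the bias for that OFM appended at the end.
    std::vector<std::vector<float>> reshape_weights(const std::vector<float> &w,
                                                    std::size_t kx, std::size_t ky,
                                                    std::size_t ifm, std::size_t ofm,
                                                    const std::vector<float> *biases)
    {
        std::vector<std::vector<float>> out(ofm);
        for(std::size_t m = 0; m < ofm; ++m)
        {
            for(std::size_t c = 0; c < ifm; ++c)
            {
                for(std::size_t y = 0; y < ky; ++y)
                {
                    for(std::size_t x = 0; x < kx; ++x)
                    {
                        // Linear index for layout [kx, ky, ifm, ofm]
                        out[m].push_back(w[((m * ifm + c) * ky + y) * kx + x]);
                    }
                }
            }
            if(biases != nullptr)
            {
                out[m].push_back((*biases)[m]); // bias appended as the last element
            }
        }
        return out;
    }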
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 6a40396f9a..8030b40a71 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -53,7 +53,7 @@ public:
     CLConvolutionLayerReshapeWeights();
     /** Set the input and output tensors.
      *
-     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: F32.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/F16/F32.
      * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
      * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
      * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
@@ -64,16 +64,16 @@ public:
     void run() override;
 
 private:
-    CLConvolutionLayerWeightsReshapeKernel _weights_reshape_kernel;
-    CLGEMMTranspose1xWKernel               _weights_transposed_kernel;
-    CLTensor                               _weights_reshaped;
-    bool                                   _transpose1xW;
+    CLWeightsReshapeKernel   _weights_reshape_kernel;
+    CLGEMMTranspose1xWKernel _weights_transposed_kernel;
+    CLTensor                 _weights_reshaped;
+    bool                     _transpose1xW;
 };
 
 /** Basic function to compute the convolution layer. This function calls the following OpenCL kernels:
  *
- * -# @ref CLConvolutionLayerWeightsReshapeKernel (executed only once for each configuration)
- * -# @ref CLGEMMTranspose1xWKernel               (executed only once for each configuration)
+ * -# @ref CLWeightsReshapeKernel   (executed only once for each configuration)
+ * -# @ref CLGEMMTranspose1xWKernel (executed only once for each configuration)
  * -# @ref CLIm2ColKernel
  * -# @ref CLGEMMInterleave4x4Kernel
  * -# @ref CLGEMMMatrixMultiplyKernel
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index b4e469196e..5f4f1ba1d7 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -39,7 +39,7 @@ class ICLTensor;
 /** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
  *
- * -# @ref CLLocallyConnectedLayerWeightsReshapeKernel (executed only once for each configuration)
+ * -# @ref CLWeightsReshapeKernel (executed only once for each configuration)
  * -# @ref CLIm2ColKernel
  * -# @ref CLLocallyConnectedMatrixMultiplyKernel
  * -# @ref CLCol2ImKernel
@@ -66,14 +66,14 @@ public:
     void run() override;
 
 private:
-    CLIm2ColKernel                              _input_im2col_kernel;
-    CLLocallyConnectedLayerWeightsReshapeKernel _weights_reshape_kernel;
-    CLLocallyConnectedMatrixMultiplyKernel      _mm_kernel;
-    CLCol2ImKernel                              _output_col2im_kernel;
-    CLTensor                                    _input_im2col_reshaped;
-    CLTensor                                    _weights_reshaped;
-    CLTensor                                    _gemm_output;
-    bool                                        _is_first_run;
+    CLIm2ColKernel                         _input_im2col_kernel;
+    CLWeightsReshapeKernel                 _weights_reshape_kernel;
+    CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
+    CLCol2ImKernel                         _output_col2im_kernel;
+    CLTensor                               _input_im2col_reshaped;
+    CLTensor                               _weights_reshaped;
+    CLTensor                               _gemm_output;
+    bool                                   _is_first_run;
 };
 }
 #endif /* __ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H__ */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 1fb94ed637..d4b4b6f10d 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -205,7 +205,7 @@ v17.02.1 Sources preview
  - New OpenCL kernels / functions:
    - @ref arm_compute::CLLogits1DMaxKernel, @ref arm_compute::CLLogits1DShiftExpSumKernel, @ref arm_compute::CLLogits1DNormKernel / @ref arm_compute::CLSoftmaxLayer
    - @ref arm_compute::CLPoolingLayerKernel / @ref arm_compute::CLPoolingLayer
-   - @ref arm_compute::CLIm2ColKernel, @ref arm_compute::CLCol2ImKernel, @ref arm_compute::CLConvolutionLayerWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayer
+   - @ref arm_compute::CLIm2ColKernel, @ref arm_compute::CLCol2ImKernel, arm_compute::CLConvolutionLayerWeightsReshapeKernel / @ref arm_compute::CLConvolutionLayer
    - @ref arm_compute::CLRemapKernel / @ref arm_compute::CLRemap
    - @ref arm_compute::CLGaussianPyramidHorKernel, @ref arm_compute::CLGaussianPyramidVertKernel / @ref arm_compute::CLGaussianPyramid, @ref arm_compute::CLGaussianPyramidHalf, @ref arm_compute::CLGaussianPyramidOrb
    - @ref arm_compute::CLMinMaxKernel, @ref arm_compute::CLMinMaxLocationKernel / @ref arm_compute::CLMinMaxLocation
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 4b5bbbbb49..835260d35a 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -80,6 +80,8 @@ std::string get_cl_type_from_data_type(const DataType &dt)
             return "ushort";
         case DataType::S16:
             return "short";
+        case DataType::QS16:
+            return "qs16";
         case DataType::U32:
             return "uint";
         case DataType::S32:
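With the new case above, get_cl_type_from_data_type() can translate QS16 into the qs16 type used on the OpenCL side. A self-contained sketch of the typical call-site pattern, assuming the usual "-DDATA_TYPE=..." build-option convention (the option name and the stand-in types are assumptions for this sketch, not the library's API):

    #include <iostream>
    #include <string>

    // Stand-ins so the sketch compiles on its own; in the library these are
    // arm_compute::DataType and arm_compute::get_cl_type_from_data_type().
    enum class DataType { QS8, QS16, F16, F32 };

    std::string get_cl_type_from_data_type(DataType dt)
    {
        switch(dt)
        {
            case DataType::QS8:  return "qs8";
            case DataType::QS16: return "qs16"; // the mapping added in this patch
            case DataType::F16:  return "half";
            case DataType::F32:  return "float";
            default:             return "";
        }
    }

    int main()
    {
        // Inject the CL-side type as a kernel compile-time option.
        const std::string build_opt = "-DDATA_TYPE=" + get_cl_type_from_data_type(DataType::QS16);
        std::cout << build_opt << "\n"; // -DDATA_TYPE=qs16
        return 0;
    }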
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 018f272921..845bd3799d 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -34,32 +34,38 @@
 using namespace arm_compute;
 
-CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
-    : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+CLWeightsReshapeKernel::CLWeightsReshapeKernel()
+    : _input(nullptr), _biases(nullptr), _output(nullptr)
 {
 }
 
 void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    if(_is_shared)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
-        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
-        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
-        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
-    }
-
-    // Check biases
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
-    }
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
+    ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
+
+    const DataType dt                   = input->info()->data_type();
+    const int      fixed_point_position = input->info()->fixed_point_position();
+
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.collapse(3);
+    const size_t tmp_dim = output_shape[0];
+    output_shape.set(0, output_shape[1]);
+    output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     _biases = biases;
     _output = output;
@@ -88,43 +94,7 @@ void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *
     ICLKernel::configure(win);
 }
 
-CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
-    : CLWeightsReshapeKernel(false)
-{
-}
-
-void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    Window out_window;
-    out_window.use_tensor_dimensions(_output->info());
-
-    Window in_slice  = window.first_slice_window_3D();
-    Window out_slice = out_window.first_slice_window_2D();
-
-    // Set arguments
-    unsigned idx = 0;
-    add_3D_tensor_argument(idx, _input, in_slice);
-    add_2D_tensor_argument(idx, _output, out_slice);
-    if(_biases != nullptr)
-    {
-        Window biases_slice;
-        biases_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
-        add_1D_tensor_argument(idx, _biases, biases_slice);
-    }
-
-    // Run kernel
-    enqueue(queue, *this, in_slice);
-}
-
-CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
-    : CLWeightsReshapeKernel(true)
-{
-}
-
-void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
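The output shape computed in configure() above is worth unpacking: collapse(3) folds the first three dimensions into one, the first two dimensions are then swapped, and one extra row is reserved for the appended biases. The following standalone check restates that arithmetic with hypothetical sizes:

    #include <cassert>
    #include <cstddef>

    int main()
    {
        const std::size_t kx = 3, ky = 3, ifm = 48, ofm = 16;
        const bool        has_bias = true;

        // collapse(3): {kx, ky, ifm, ofm} -> {kx * ky * ifm, ofm}
        std::size_t dim0 = kx * ky * ifm; // 432
        std::size_t dim1 = ofm;           // 16

        // Swap, then add one row across all columns for the biases:
        // output is [OFM, kx * ky * IFM + 1] = [16, 433]
        const std::size_t tmp_dim = dim0;
        dim0 = dim1;
        dim1 = tmp_dim + (has_bias ? 1 : 0);

        assert(dim0 == 16 && dim1 == 433);
        return 0;
    }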
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 4f52bf6279..e9b76e7967 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -95,7 +95,7 @@ NEWeightsReshapeKernel::NEWeightsReshapeKernel()
 void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
 
@@ -108,28 +108,21 @@ void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias
     output_shape.set(0, output_shape[1]);
     output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
 
-    // Set data type and shape for output tensor if not yet configured
-    set_data_type_if_unknown(*output->info(), dt);
-    set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     if(bias != nullptr)
     {
-        TensorShape bias_shape{ input->info()->tensor_shape()[3] };
-
-        // Set data type and shape for bias tensor if not yet configured
-        set_data_type_if_unknown(*bias->info(), dt);
-        set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
-        set_shape_if_empty(*bias->info(), bias_shape);
-
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::F32, DataType::QS8);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->num_dimensions() != 1));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->num_dimensions() != 2));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3]));
+        ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3] || bias->info()->dimension(1) != input->info()->tensor_shape()[4]));
     }
 
     _input = input;
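Both back ends now funnel output initialization through auto_init_if_empty(), which fills in shape, data type and fixed-point position only when the caller left the tensor info empty, and otherwise leaves the existing metadata for the ERROR_ON_MISMATCHING_* checks to validate. A simplified stand-in illustrating that pattern (SimpleTensorInfo and the enum are sketches under that assumption, not the library's types):

    #include <cstddef>
    #include <vector>

    enum class DataType { UNKNOWN, QS8, F16, F32 };

    struct SimpleTensorInfo
    {
        std::vector<std::size_t> shape;                    // empty => not yet initialized
        int                      num_channels         = 0;
        DataType                 data_type            = DataType::UNKNOWN;
        int                      fixed_point_position = 0;
    };

    // If the tensor metadata is still empty, fill it in from the computed
    // values; if the caller already initialized it, leave it untouched so
    // the subsequent mismatch checks compare against the expected values.
    void auto_init_if_empty(SimpleTensorInfo &info, const std::vector<std::size_t> &shape,
                            int num_channels, DataType dt, int fixed_point_position)
    {
        if(info.shape.empty())
        {
            info.shape                = shape;
            info.num_channels         = num_channels;
            info.data_type            = dt;
            info.fixed_point_position = fixed_point_position;
        }
    }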
--
cgit v1.2.1