Diffstat (limited to 'arm_compute')
 arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h | 18 +++++++++---------
 arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h | 42 ++++++------------------------------------
 arm_compute/runtime/CL/functions/CLGEMM.h                |  2 +-
 3 files changed, 22 insertions(+), 40 deletions(-)
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index dec63e0679..a768a19914 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -30,10 +30,10 @@ namespace arm_compute
 {
 class ICLTensor;
 
-/** OpenCL kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha
+/** OpenCL kernel to multiply two input matrices "A" and "B". All elements of the output matrix will be multiplied by alpha
  *
- * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
- * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
+ * @note If the input tensors @p input0 and @p input1 have been reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel,
+ *       the flag @p is_interleaved_transposed must be set to true
  *
  * @attention The second input tensor must have at least 2 dimensions (matrix)
  *
@@ -53,13 +53,13 @@ public:
     CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
     /** Initialise the kernel's input, output and alpha
      *
-     * @param[in]  input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32
-     * @param[in]  input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
-     *                    If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
-     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
-     * @param[in]  alpha  Weight of the matrix product
+     * @param[in]  input0                    Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in]  alpha                     Weight of the matrix product
+     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
      */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha);
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index a29f68fcf1..e076f51b26 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -24,12 +24,10 @@
 #ifndef __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__
 #define __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__
 
-#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
@@ -38,41 +36,25 @@ namespace arm_compute
 {
 /** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls the following kernels:
  *
- * -# @ref CLTransposeKernel (if @p transpose_weights is set to true)
- * -# @ref CLGEMMTranspose1xWKernel (if @p is_batched_fc_layer is set to true)
+ * -# @ref CLTransposeKernel
  *
  * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
  */
-class CLFullyConnectedLayerReshapeWeights : public IFunction
+class CLFullyConnectedLayerReshapeWeights : public ICLSimpleFunction
 {
 public:
-    /** Constructor */
-    CLFullyConnectedLayerReshapeWeights();
     /** Set the input and output tensors.
      *
-     * @param[in]  input               Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QS16/F16/F32.
-     * @param[out] output              Destination tensor. Data type supported: Same as @p input.
-     * @param[in]  transpose_weights   True if the weights must be transposed. Data types supported: Same as @p weights.
-     * @param[in]  is_batched_fc_layer True if it is a batched fully connected layer
+     * @param[in]  input  Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QS16/F16/F32.
+     * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, bool transpose_weights, bool is_batched_fc_layer);
-
-    // Inherited methods overridden:
-    void run() override;
-
-private:
-    CLTransposeKernel        _transpose_kernel;
-    CLGEMMTranspose1xWKernel _transpose1xW_kernel;
-    CLTensor                 _transpose_output;
-    bool                     _transpose_weights;
-    bool                     _is_batched_fc_layer;
+    void configure(const ICLTensor *input, ICLTensor *output);
 };
 
 /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
  *
  * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
- * -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false) (called once)
- * -# @ref CLGEMMInterleave4x4Kernel (called if we have a multi-batch input)
+ * -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and @p transpose_weights is set to true) (called once)
  * -# @ref CLGEMMMatrixMultiplyKernel
  * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
  *
@@ -85,7 +67,7 @@ public:
     CLFullyConnectedLayer();
     /** Set the input and output tensors.
      *
-     * @param[in]  input   Source tensor. Data type supported: QS8/F16/F32.
+     * @param[in]  input   Source tensor. Data type supported: QS8/QS16/F16/F32.
      * @param[in]  weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
      * @param[in]  biases  Bias tensor. It can be nullptr. Data type supported: Same as @p input.
      * @param[out] output  Destination tensor. Data type supported: Same as @p input.
@@ -98,17 +80,17 @@ public:
     void run() override;
 
 private:
+    void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+    void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
+
     CLIm2ColKernel                      _im2col_kernel;
     CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
-    CLGEMMInterleave4x4Kernel           _interleave4x4_kernel;
     CLGEMMMatrixMultiplyKernel          _mm_kernel;
     CLGEMMMatrixAccumulateBiasesKernel  _accumulate_biases_kernel;
     CLTensor                            _im2col_output;
-    CLTensor                            _interleave4x4_output;
     CLTensor                            _reshape_weights_output;
     bool                                _are_weights_reshaped;
-    bool                                _is_batched_fc_layer;
-    bool                                _linearize_input;
+    bool                                _is_fc_after_conv;
     bool                                _accumulate_biases;
 };
 }
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 9207efd68f..9b887305cb 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -76,7 +76,7 @@ private:
     CLGEMMMatrixAdditionKernel _ma_kernel;
     CLTensor                   _tmp_a;
     CLTensor                   _tmp_b;
-    bool                       _run_vector_matrix_multiplication;
+    bool                       _is_interleaved_transposed;
     bool                       _run_addition;
 };
 }
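For context, a minimal sketch of how a caller would drive the reworked CLGEMMMatrixMultiplyKernel::configure() after this change. The helper name run_plain_gemm and its arguments are illustrative, not part of the patch; passing false selects the new non-reshaped path, while the default true keeps the old interleaved/transposed behaviour:

    #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // Hypothetical helper: multiply A by B directly, without the
    // CLGEMMInterleave4x4Kernel / CLGEMMTranspose1xWKernel reshape passes.
    void run_plain_gemm(const ICLTensor *a, const ICLTensor *b, ICLTensor *out, float alpha)
    {
        CLGEMMMatrixMultiplyKernel mm_kernel;
        mm_kernel.configure(a, b, out, alpha, /* is_interleaved_transposed = */ false);
        CLScheduler::get().enqueue(mm_kernel);
    }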
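The slimmed-down CLFullyConnectedLayerReshapeWeights likewise reduces to a single transpose. A usage sketch with tensor allocation elided; the weights/weights_t names are illustrative:

    // weights   : 2D CLTensor holding the FC weights
    // weights_t : destination CLTensor for the transposed weights
    CLFullyConnectedLayerReshapeWeights reshape_weights;
    reshape_weights.configure(&weights, &weights_t); // no more transpose_weights / is_batched_fc_layer flags
    reshape_weights.run();                           // single CLTransposeKernel dispatch via ICLSimpleFunction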
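The header alone does not show how _is_fc_after_conv is derived; the implementation lives in src/runtime/CL/functions/CLFullyConnectedLayer.cpp, which this diffstat (limited to 'arm_compute') filters out. A plausible sketch of the dispatch, assuming the usual heuristic that an input with more than two dimensions comes from a convolutional layer:

    // Assumed sketch of the dispatch, not code from this patch.
    void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights,
                                          const ICLTensor *biases, ICLTensor *output,
                                          bool transpose_weights, bool are_weights_reshaped)
    {
        // Heuristic (assumption): a >2D input was produced by a convolutional
        // layer and must be linearized with CLIm2ColKernel before the GEMM.
        _is_fc_after_conv = input->info()->num_dimensions() > 2;

        if(_is_fc_after_conv)
        {
            configure_conv_fc(input, weights, output); // im2col + matrix multiply
        }
        else
        {
            configure_fc_fc(input, weights, output);   // matrix multiply only
        }
        // Weight reshaping and bias accumulation are configured as before.
    }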