diff options
21 files changed, 521 insertions, 377 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h index 2520eff5de..c0fef45afe 100644 --- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -47,7 +47,7 @@ class ICLTensor; * \end{array} \right) * @f] * - * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + * After this operation, the output matrix will have the following shape: [ height * W, ceil(width / W) ] where W = 4 * mult_interleave4x4_height */ class CLGEMMInterleave4x4Kernel : public ICLKernel { @@ -64,18 +64,20 @@ public: CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 - * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block */ - void configure(const ICLTensor *input, ICLTensor *output); + void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel * - * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 - * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. + * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. + * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height); // Inherited methods overridden void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h index 4e73d7eb13..7260c4a4f6 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -58,8 +58,10 @@ public: * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 * @param[in] alpha Weight of the matrix product * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel + * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped + * */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true); + void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel * * @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32 @@ -67,11 +69,13 @@ public: * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 * @param[in] alpha Weight of the matrix product * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel + * @param[in] reshape_info GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped * @param[in] gpu_target GPU Target * * @return a status */ - static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target); + static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, + GPUTarget gpu_target); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h index 8721643c1e..9a3069eab6 100644 --- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -62,7 +62,7 @@ class ICLTensor; * \end{array} \right) * @f] * - * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width * */ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel @@ -70,18 +70,20 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 - * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[out] output Output tensor. Data type supported: same as @p input + * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block */ - void configure(const ICLTensor *input, ICLTensor *output); + void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 - * @param[in] output Output tensor. Data type supported: same as @p input. + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output Output tensor. Data type supported: same as @p input. + * @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 5402e358b5..5197000bf9 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2018 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -824,13 +824,95 @@ private: const unsigned int _num_kernels; }; -/** GEMM Information class. This class stores the necessary information to compute GEMM functions */ +/** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape. + * + * The matrix A can only be reshaped through @ref CLGEMMInterleave4x4Kernel or @ref NEGEMMInterleave4x4Kernel or @ref GCGEMMInterleave4x4Kernel + * Note: Optionally just for @ref CLGEMMInterleave4x4Kernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block + * + * The matrix B can only be reshaped through @ref CLGEMMTranspose1xWKernel or @ref NEGEMMTranspose1xWKernel or @ref GCGEMMTranspose1xWKernel + * Note: Optionally just for @ref CLGEMMTranspose1xWKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block + * + */ +class GEMMReshapeInfo final +{ +public: + /** Default constructor */ + GEMMReshapeInfo() + : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1) + { + } + /** Constructor + * + * @param[in] m Number of matrix A rows + * @param[in] n Number of matrix B columns + * @param[in] k Number of matrix A columns or matrix B rows + * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block + * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleaved block + */ + GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1) + : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height) + { + } + /** Number of matrix A rows + * + * @return the number of matrix A rows + */ + int m() const + { + return _m; + } + /** Number of matrix B columns + * + * @return the number of matrix B columns + */ + int n() const + { + return _n; + } + /** Number of matrix A columns or matrix B rows + * + * @return the number of matrix A columns or matrix B rows + */ + int k() const + { + return _k; + } + /** Multiplication factor for the width of the 1xW transposed block + * + * @return the multiplication factor for the width of the 1xW transposed block + */ + int mult_transpose1xW_width() const + { + return _mult_transpose1xW_width; + } + /** Multiplication factor for the height of the 4x4 interleaved block + * + * @return the multiplication factor for the height of the 4x4 interleaved block + */ + int mult_interleave4x4_height() const + { + return _mult_interleave4x4_height; + } + +private: + const int _m; + const int _n; + const int _k; + const int _mult_transpose1xW_width; + const int _mult_interleave4x4_height; +}; + +/** GEMM information class. This class stores the necessary information to compute GEMM functions + * + * This object also contains the information about how matrix A and matrix B have been reshaped + * + */ class GEMMInfo { public: /** Default constructor */ GEMMInfo() - : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false) + : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _reshape_info() { } /** Constructor @@ -838,9 +920,10 @@ public: * @param[in] is_a_reshaped True if the matrix A has been reshaped * @param[in] is_b_reshaped True if the matrix B has been reshaped * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run + * @param[in] reshape_info (Optional) GEMM reshape information object */ - GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run) - : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run) + GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()) + : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _reshape_info(reshape_info) { } /** Flag which specifies if the matrix A has been reshaped @@ -869,11 +952,20 @@ public: { return _reshape_b_only_on_first_run; }; + /** GEMMReshapeInfo object which stores the necessary information to understand how the matrix A and matrix B have been reshaped + * + * @return the GEMMReshapeInfo object + */ + const GEMMReshapeInfo &reshape_info() const + { + return _reshape_info; + } private: - const bool _is_a_reshaped; - const bool _is_b_reshaped; - const bool _reshape_b_only_on_first_run; + const bool _is_a_reshaped; + const bool _is_b_reshaped; + const bool _reshape_b_only_on_first_run; + GEMMReshapeInfo _reshape_info; }; /** IO formatting information class*/ diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 61834b88a9..6ecfdf0323 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,12 +39,14 @@ inline TensorShape compute_permutation_output_shape(const ITensorInfo &input, co permute(output_shape, perm); return output_shape; } -inline TensorShape compute_interleaved_shape(const ITensorInfo &a) +inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1) { - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height + ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1); + const int interleave_width = 4 * mult_interleave4x4_height; TensorShape shape_interleaved_a{ a.tensor_shape() }; - shape_interleaved_a.set(0, a.dimension(0) * 4); - shape_interleaved_a.set(1, std::ceil(a.dimension(1) / 4.f)); + shape_interleaved_a.set(0, a.dimension(0) * interleave_width); + shape_interleaved_a.set(1, std::ceil(a.dimension(1) / static_cast<float>(interleave_width))); return shape_interleaved_a; } @@ -57,12 +59,14 @@ inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b) return shape_transposed1xW_b; } -inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b) +inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width = 1) { - // The transpose1xW output matrix will have the following shape: - // [ b_height * (16 / element_size), ceil(b_width / (16.0f / element_size) ] + // Note: mult_transpose1xW_width expresses the number of chunks with size 1x(W) we want to store on the same row + // The transpose1xW output matrix will have the following shape: + // [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width + ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1); TensorShape shape_transposed1xW_b{ b.tensor_shape() }; - const size_t transpose_width = 16 / b.element_size(); + const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width; shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width); shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width)))); diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h index ab1d8b8866..853b90df82 100644 --- a/arm_compute/graph/Graph.h +++ b/arm_compute/graph/Graph.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ #ifndef __ARM_COMPUTE_GRAPH_GRAPH_H__ #define __ARM_COMPUTE_GRAPH_GRAPH_H__ +#include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/graph/INode.h" #include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/SubTensor.h" @@ -67,9 +68,12 @@ public: * @param[in] tensor Tensor to add */ void add_tensor_object(std::unique_ptr<ITensorObject> tensor); - /** Finalizes the current node's configuration + /** Check if the OpenCL target is available */ static bool opencl_is_available(); + /** Returns the GPU target + */ + static GPUTarget gpu_target(); /** Manually sets the output of the current node * * @param[in] tmp Output info to set diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h index bf41226bda..0f144915d7 100644 --- a/arm_compute/runtime/CL/functions/CLGEMM.h +++ b/arm_compute/runtime/CL/functions/CLGEMM.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -68,7 +68,8 @@ public: * @param[in] alpha Weight of the matrix product * @param[in] beta Weight of matrix C * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should happen only for the first run + * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping + * in case matrix A and matrix B have been already transformed. */ void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp index 8705c8ed1e..2f2c8bd182 100644 --- a/examples/graph_alexnet.cpp +++ b/examples/graph_alexnet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -54,8 +54,10 @@ public: constexpr float mean_b = 104.01f; /* Mean value to subtract from blue channel */ // Set target. 0 (NEON), 1 (OpenCL). By default it is NEON - TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0); - ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT; + TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0); + + const bool is_gemm_convolution5x5 = Graph::gpu_target() == arm_compute::GPUTarget::MIDGARD || target_hint == TargetHint::NEON; + ConvolutionMethodHint convolution_5x5_hint = is_gemm_convolution5x5 ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT; // Parse arguments if(argc < 2) @@ -102,7 +104,7 @@ public: << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)) << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0))) // Layer 2 - << convolution_hint + << convolution_5x5_hint << ConvolutionLayer( 5U, 5U, 256U, get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy"), @@ -111,6 +113,7 @@ public: << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f)) << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0))) + << ConvolutionMethodHint::GEMM // Layer 3 << ConvolutionLayer( 3U, 3U, 384U, diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp index 1e9601b492..b2e2f1bf8f 100644 --- a/examples/graph_googlenet.cpp +++ b/examples/graph_googlenet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,7 +55,7 @@ public: // Set target. 0 (NEON), 1 (OpenCL). By default it is NEON TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0); - ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT; + ConvolutionMethodHint convolution_hint = ConvolutionMethodHint::GEMM; // Parse arguments if(argc < 2) diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp index 8c3f9b6fbc..83d1db9f0f 100644 --- a/examples/graph_mobilenet.cpp +++ b/examples/graph_mobilenet.cpp @@ -52,7 +52,7 @@ public: // Set target. 0 (NEON), 1 (OpenCL). By default it is NEON TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0); - ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT; + ConvolutionMethodHint convolution_hint = ConvolutionMethodHint::GEMM; // Set model to execute. 0 (MobileNetV1_1.0_224), 1 (MobileNetV1_0.75_160) int model_id = (argc > 2) ? std::strtol(argv[2], nullptr, 10) : 0; diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp index b21f2fe5c4..e85108702d 100644 --- a/examples/graph_squeezenet.cpp +++ b/examples/graph_squeezenet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -59,8 +59,7 @@ public: constexpr float mean_b = 104.01f; /* Mean value to subtract from blue channel */ // Set target. 0 (NEON), 1 (OpenCL). By default it is NEON - TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0); - ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT; + TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0); // Parse arguments if(argc < 2) @@ -102,7 +101,6 @@ public: get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy"), get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"), PadStrideInfo(2, 2, 0, 0)) - << convolution_hint << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)) << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL))) << ConvolutionLayer( diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp index 1a804a4882..d97c5b5d02 100644 --- a/examples/graph_vgg16.cpp +++ b/examples/graph_vgg16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -91,7 +91,6 @@ public: << convolution_hint << Tensor(TensorInfo(TensorShape(224U, 224U, 3U, 1U), 1, DataType::F32), get_input_accessor(image, mean_r, mean_g, mean_b)) - << ConvolutionMethodHint::DIRECT // Layer 1 << ConvolutionLayer( 3U, 3U, 64U, diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 6695881d09..ae3553860a 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -211,9 +211,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "gaussian1x5_sub_x", "gaussian_pyramid.cl" }, { "gaussian5x1_sub_y", "gaussian_pyramid.cl" }, { "gemm_accumulate_biases", "gemm.cl" }, - { "gemm_interleave4x4_8bit", "gemm.cl" }, - { "gemm_interleave4x4_16bit", "gemm.cl" }, - { "gemm_interleave4x4_32bit", "gemm.cl" }, + { "gemm_interleave4x4", "gemm.cl" }, { "gemm_ma_f16", "gemm.cl" }, { "gemm_ma_f32", "gemm.cl" }, { "gemm_ma_qs8", "gemm.cl" }, @@ -230,9 +228,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "gemm_mm_qs8", "gemm.cl" }, { "gemm_mm_qs16", "gemm.cl" }, { "gemm_lc_vm_f32", "gemm.cl" }, - { "gemm_transpose1x16", "gemm.cl" }, - { "gemm_transpose1x8", "gemm.cl" }, - { "gemm_transpose1x4", "gemm.cl" }, + { "gemm_transpose1xW", "gemm.cl" }, { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" }, { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" }, { "gemmlowp_mm_bifrost", "gemmlowp.cl" }, diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index c763cb355b..bad09f3c42 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -27,41 +27,23 @@ #include "fixed_point.h" #endif // FIXED_POINT_POSITION -/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - uint x = get_global_id(0); - uint y = get_global_id(1); +#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH) - // Compute address for Matrix B - source - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes)); +#if TRANSPOSE_W == 4 +#define DATA_TYPE uint +#elif TRANSPOSE_W == 8 +#define DATA_TYPE ushort +#elif TRANSPOSE_W == 16 +#define DATA_TYPE uchar +#else // TRANSPOSE_W == 16 +#error "Transpose width not supported" +#endif // TRANSPOSE_W - uint4 b0 = vload4(0, (__global uint *)src.ptr); - - vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes)); -} - -/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix +/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix + * + * @attention The multiplication factor (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2) * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16 + * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) @@ -69,12 +51,12 @@ __kernel void gemm_transpose1x4(IMAGE_DECLARATION(src), * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ -__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src), +__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst)) { uint x = get_global_id(0); @@ -84,16 +66,22 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src), Image src = CONVERT_TO_IMAGE_STRUCT(src); // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes)); + uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y + + (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE); - ushort8 b0 = vload8(0, (__global ushort *)src.ptr); + VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W) + b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr); - vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes)); + VSTORE(TRANSPOSE_W) + (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes)); } +#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH) -/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix +#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE) + +/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8 + * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) @@ -106,9 +94,10 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src), * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ -__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src), +__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst)) { + // Compute source and destination addresses uint x = get_global_id(0); uint y = get_global_id(1); @@ -116,141 +105,35 @@ __kernel void gemm_transpose1x16(IMAGE_DECLARATION(src), Image src = CONVERT_TO_IMAGE_STRUCT(src); // Compute address for Matrix B transposed - destination. X and Y are swapped - uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes)); - - uchar16 b0 = vload16(0, (__global uchar *)src.ptr); - - vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes)); -} - -/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values from Matrix A - uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0))); - uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1))); - uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2))); - uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3))); - - uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 0); - - val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 4); - - val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 8); - - val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3); - vstore4(val0, 0, ((__global uint *)dst.ptr) + 12); -} - -/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Load values from Matrix A - ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0))); - ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1))); - ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2))); - ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3))); - - ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0); - - val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8); - - val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16); - - val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7)); - vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24); -} - -/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8 - * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y + + (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE); // Load values from Matrix A - uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0))); - uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1))); - uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2))); - uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3))); - - uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1), - (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0); - - val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5), - (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16); - - val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9), - (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32); - - val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD), - (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF)); - vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48); + VEC_DATA_TYPE(DATA_TYPE, 4) + a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0))); + VEC_DATA_TYPE(DATA_TYPE, 4) + a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1))); + VEC_DATA_TYPE(DATA_TYPE, 4) + a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2))); + VEC_DATA_TYPE(DATA_TYPE, 4) + a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3))); + + VEC_DATA_TYPE(DATA_TYPE, 4) + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT)); + + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT)); + + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT)); + + val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3); + vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT)); } +#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE) -#if defined(COLS_B) +#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication * @@ -270,30 +153,32 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; - // Divide by 4 in order to get the src_addr in unit of float - src_addr = src_addr >> 2; + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global float *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators float4 c00 = 0.0f; @@ -301,11 +186,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) float4 c20 = 0.0f; float4 c30 = 0.0f; - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8)) + for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0); - float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1); + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); c00 += (float4)a0.s0 * b0; c10 += (float4)a0.s1 * b0; @@ -313,8 +198,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) c30 += (float4)a0.s3 * b0; // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4); - b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4); + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); c00 += (float4)a0.s0 * b0; c10 += (float4)a0.s1 * b0; @@ -322,11 +207,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) c30 += (float4)a0.s3 * b0; } - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0); - float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1); + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); c00 += (float4)a0.s0 * b0; c10 += (float4)a0.s1 * b0; @@ -371,23 +256,33 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0) * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; + + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4; + // src_addr_a = address of matrix A // src_addr_b = address of matrix B - __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes); + __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B __global float *src_end_addr_b = src_addr_b + COLS_B; + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + // Reset accumulators float c00 = 0.0f; float c01 = 0.0f; @@ -406,7 +301,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) float c32 = 0.0f; float c33 = 0.0f; - for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16) + for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH)) { // Load values from matrix A (interleaved) and matrix B (transposed) float4 a0 = vload4(0, src_addr_a); @@ -433,8 +328,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 4); - b0 = vload4(0, src_addr_b + 4); + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH); c00 = fma(a0.s0, b0.s0, c00); c01 = fma(a0.s0, b0.s1, c01); @@ -457,8 +352,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 8); - b0 = vload4(0, src_addr_b + 8); + a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); c00 = fma(a0.s0, b0.s0, c00); c01 = fma(a0.s0, b0.s1, c01); @@ -481,8 +376,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 12); - b0 = vload4(0, src_addr_b + 12); + a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH); c00 = fma(a0.s0, b0.s0, c00); c01 = fma(a0.s0, b0.s1, c01); @@ -505,7 +400,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = fma(a0.s3, b0.s3, c33); } - for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4) + for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH)) { // Load values from matrix A (interleaved) and matrix B (transposed) float4 a0 = vload4(0, src_addr_a); @@ -555,8 +450,6 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) c33 = c33 * ALPHA; #endif // defined(ALPHA) - barrier(CLK_GLOBAL_MEM_FENCE); - // Store 4x4 block vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0))); vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1))); @@ -584,30 +477,32 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0) * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B - - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; - // Divide by 2 in order to get the src_addr in unit of half - src_addr = src_addr >> 1; + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global half *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators half8 c00 = 0.0f; @@ -615,11 +510,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), half8 c20 = 0.0f; half8 c30 = 0.0f; - for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16)) + for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0); - half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1); + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); c00 += (half8)a0.s0 * b0; c10 += (half8)a0.s1 * b0; @@ -627,8 +522,8 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), c30 += (half8)a0.s3 * b0; // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4); - b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8); + a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT); + b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH); c00 += (half8)a0.s0 * b0; c10 += (half8)a0.s1 * b0; @@ -636,11 +531,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), c30 += (half8)a0.s3 * b0; } - for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0); - half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1); + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); c00 += (half8)a0.s0 * b0; c10 += (half8)a0.s1 * b0; @@ -689,27 +584,32 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes; + __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes; // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global char *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators short8 c00 = 0.0f; @@ -722,11 +622,11 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), short8 c31 = 0.0f; // This for loop performs 1 accumulation for each iteration - for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH) { // Load values from matrix A (interleaved) and matrix B (transposed) - char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0); - char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1); + char4 a0 = vload4(0, src_addr_a); + char16 b0 = vload16(0, src_addr_b); c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION); c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION); @@ -783,30 +683,32 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0), * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix */ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1), IMAGE_DECLARATION(dst)) { - // src_addr.s0 = address of matrix A - // src_addr.s1 = address of matrix B + int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH; + int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT; - // Compute address for matrix A and B - int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y), - (src1_stride_y)); + // Offset + const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4; + const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8; - // Add offset_first_element_in_bytes - src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Divide by 2 in order to get the src_addr in unit of short - src_addr = src_addr >> 1; + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes); + __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes); // Compute end row address for matrix B - int end_row_mtx_b = src_addr.s1 + COLS_B; + __global short *src_end_addr_b = src_addr_b + COLS_B; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; // Reset accumulators int8 c00 = 0.0f; @@ -815,11 +717,11 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), int8 c30 = 0.0f; // This for loop performs 1 accumulation for each iteration - for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8)) + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH) { /* Load values from matrix A (interleaved) and matrix B (transposed) */ - short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0); - short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1); + short4 a0 = vload4(0, src_addr_a); + short8 b0 = vload8(0, src_addr_b); c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION); c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION); @@ -850,7 +752,7 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0), vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3))); } #endif // defined(FIXED_POINT_POSITION) -#endif // defined(COLS_B) +#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT) #if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y) #if defined(DATA_TYPE) diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp index 6886f54602..241dd8549d 100644 --- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp +++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,17 +40,16 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height) { + ARM_COMPUTE_RETURN_ERROR_ON(mult_interleave4x4_height < 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); if(output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } @@ -58,11 +57,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int mult_interleave4x4_height) { - unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_x = 4; constexpr unsigned int num_elems_processed_per_iteration_y = 4; - const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y; + const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y * mult_interleave4x4_height; bool window_changed = false; // Configure kernel window @@ -73,7 +72,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen // Configure window in case of configured output if(output->total_size() != 0) { - AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); + const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height); + const float scale_y = 1.0f / (scale_x); + + AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y); window_changed = window_changed || update_window_and_padding(win, output_access); output_access.set_valid_region(win, input->valid_region()); } @@ -88,25 +90,42 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel() { } -void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output) +void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info(), mult_interleave4x4_height))); // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_interleave4x4_height)); _input = input; _output = output; + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); + switch(input->info()->element_size()) + { + case 1: + build_opts.add_option("-DDATA_TYPE=uchar"); + break; + case 2: + build_opts.add_option("-DDATA_TYPE=ushort"); + break; + case 4: + build_opts.add_option("-DDATA_TYPE=uint"); + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + } + // Create kernel - std::string kernel_name = "gemm_interleave4x4_" + support::cpp11::to_string(input->info()->element_size() * 8) + "bit"; - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts.options())); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info()); + auto win_config = validate_and_configure_window(input->info(), output->info(), mult_interleave4x4_height); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure(win_config.second); @@ -119,10 +138,10 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out _config_id += support::cpp11::to_string(output->info()->dimension(1)); } -Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output) +Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_interleave4x4_height)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mult_interleave4x4_height).first); return Status{}; } @@ -144,10 +163,6 @@ void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queu Window in_slice = window.first_slice_window_2D(); Window out_slice = window.first_slice_window_2D(); - // Change x and y steps for the slide of output tensor - out_slice.scale(Window::DimX, 4.f); - out_slice.scale(Window::DimY, 0.25f); - do { unsigned int idx = 0; diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 19f38bf5a5..e23feb269a 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -36,24 +36,68 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include <set> #include <string> using namespace arm_compute; +using namespace arm_compute::misc::shape_calculator; namespace { using ElementsProcessed = Steps; -inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed) +inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1); + if(!is_interleaved_transposed) { ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1)); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output); + } + } + else + { + const int m = reshape_info.m(); + const int n = reshape_info.n(); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + TensorShape tensor_shape0{ input0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ input1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n)); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output); + } } return Status{}; @@ -122,12 +166,19 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() { } -void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed) +void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + // Output tensor auto inizialitation if not yet initialized + TensorShape tensor_shape{ input0->info()->tensor_shape() }; + tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0)); + tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1)); + + auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape)); + // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info)); _input0 = input0; _input1 = input1; @@ -176,7 +227,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen std::string kernel_name; if(is_interleaved_transposed) { + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); + build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width)); + build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height)); + if(data_type == DataType::F32) { kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target); @@ -230,11 +287,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } -Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target) +Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, + const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target) { + // Note: num_elements_processed will be set in validate_and_configure_window() ElementsProcessed num_elements_processed{}; ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get(), diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp index 69a545b76b..63aed6df32 100644 --- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -42,8 +42,9 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width) { + ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); @@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), - compute_transpose1xW_with_element_size_shape(*input)); + compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } @@ -59,11 +60,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration, int mult_transpose1xW_width) { num_elems_processed_per_iteration = 16 / input->element_size(); - const int scale_x = num_elems_processed_per_iteration; + const int scale_x = num_elems_processed_per_iteration * mult_transpose1xW_width; bool window_changed = false; // Configure kernel window @@ -90,25 +91,31 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output) +void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info(), mult_transpose1xW_width))); // Perform validate step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_transpose1xW_width)); _input = input; _output = output; // Configure kernel window + // Note: num_elems_processed_per_iteration will be set in validate_and_configure_window() unsigned int num_elems_processed_per_iteration = 1; - auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration); + auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, mult_transpose1xW_width); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure(win_config.second); + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width)); + /* * Following an example of how the transposition1xW works when the input data type is F32 * @@ -117,18 +124,18 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | * |a30 a31 a32 a33| * - * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width */ // Create kernel - std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration); - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name)); + std::string kernel_name = "gemm_transpose1xW"; + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } -Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width) { unsigned int num_elems_processed_per_iteration = 1; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_transpose1xW_width)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, mult_transpose1xW_width).first); return Status{}; } diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp index ac5316f55e..e14bea0846 100644 --- a/src/graph/Graph.cpp +++ b/src/graph/Graph.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -215,11 +215,25 @@ void Graph::add_tensor_object(std::unique_ptr<ITensorObject> tensor) _pimpl->_graph_output->allocate(); } } + bool Graph::opencl_is_available() { return arm_compute::opencl_is_available(); } +arm_compute::GPUTarget Graph::gpu_target() +{ + // Check if OpenCL is available before returning the GPU target + if(opencl_is_available()) + { + return arm_compute::CLScheduler::get().target(); + } + else + { + return GPUTarget::MIDGARD; + } +} + void Graph::set_temp(TensorInfo &&tmp) { ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr); diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index 68c6576a79..e9d14db96e 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,7 +55,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } else { - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target)); } return Status{}; diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index c676a10978..a09849ab93 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -38,6 +38,30 @@ using namespace arm_compute; +namespace +{ +inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target) +{ + bool flag = true; + + if(gpu_target == GPUTarget::BIFROST) + { + // COMPMID-852 + if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run) + { + const float scale = k < 1024 ? 2.0f : 2.5f; + flag = scale * n > 1.66f * n + 38.4f; + } + else + { + flag = false; + } + } + + return flag; +} +} // namespace + CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false) @@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors - // For Bifrost architectures we do not reshape the input matrices - _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST); - // Check if we need to reshape the matrix B only on the first run _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); const ICLTensor *matrix_a = a; const ICLTensor *matrix_b = b; - // Set the target for the matrix multiply kernel - _mm_kernel.set_target(CLScheduler::get().target()); + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + + // Set the target for the kernels + _interleave_kernel.set_target(gpu_target); + _mm_kernel.set_target(gpu_target); + + // Arguments used by GEMMReshapeInfo + // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo + // in order to know how the matrices have been reshaped + const int m = a->info()->dimension(1); + const int n = b->info()->dimension(0); + const int k = a->info()->dimension(0); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + + if(gpu_target == GPUTarget::BIFROST) + { + mult_transpose1xW_width = 4; + mult_interleave4x4_height = 2; + } + + // Check if we need to reshape the matrix A and matrix B + _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target); if(_is_interleaved_transposed) { @@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel // Configure interleave kernel - _interleave_kernel.configure(a, &_tmp_a); + _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height); // Configure transpose kernel - _transpose_kernel.configure(b, &_tmp_b); + _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width); // Manage intermediate buffers _memory_group.manage(&_tmp_a); _memory_group.manage(&_tmp_b); } - _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed); + _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height)); if(_is_interleaved_transposed) { diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 2cd426b82d..5f886a02c6 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -148,8 +148,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type()); TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, 1)); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, 1)); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output)); } else |