From f932d2c8409831cb9cb97a2eb65be93ad4709cd6 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Mon, 6 Jul 2020 11:27:21 +0100 Subject: COMPMID-3386: Support memory injection in CLConcatenate functions/kernels Signed-off-by: Georgios Pinitas Change-Id: I611adf4f506d406540e920b0bd6befb4b5108918 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3601 Tested-by: Arm Jenkins Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins --- .../CL/kernels/CLBatchConcatenateLayerKernel.h | 20 +-- .../CL/kernels/CLDepthConcatenateLayerKernel.h | 22 +-- .../CL/kernels/CLHeightConcatenateLayerKernel.h | 21 +-- .../CL/kernels/CLWidthConcatenate2TensorsKernel.h | 21 +-- .../CL/kernels/CLWidthConcatenate4TensorsKernel.h | 23 +--- .../CL/kernels/CLWidthConcatenateLayerKernel.h | 19 +-- arm_compute/runtime/CL/ICLOperator.h | 2 +- .../runtime/CL/functions/CLConcatenateLayer.h | 69 ++++++++-- arm_compute/runtime/CL/functions/CLLSTMLayer.h | 9 +- .../CL/kernels/CLBatchConcatenateLayerKernel.cpp | 49 ++++--- .../CL/kernels/CLDepthConcatenateLayerKernel.cpp | 41 +++--- .../CL/kernels/CLHeightConcatenateLayerKernel.cpp | 44 +++--- .../kernels/CLWidthConcatenate2TensorsKernel.cpp | 68 ++++----- .../kernels/CLWidthConcatenate4TensorsKernel.cpp | 94 +++++++------ .../CL/kernels/CLWidthConcatenateLayerKernel.cpp | 39 +++--- src/graph/backends/CL/CLFunctionsFactory.cpp | 2 +- src/runtime/CL/functions/CLConcatenateLayer.cpp | 152 ++++++++++++++------- src/runtime/CL/functions/CLLSTMLayer.cpp | 26 ++-- .../NEON/functions/NEGEMMAssemblyDispatch.cpp | 3 +- tests/validation/CL/BatchConcatenateLayer.cpp | 8 +- tests/validation/CL/DepthConcatenateLayer.cpp | 8 +- tests/validation/CL/WidthConcatenateLayer.cpp | 8 +- .../GLES_COMPUTE/DepthConcatenateLayer.cpp | 4 +- .../validation/fixtures/ConcatenateLayerFixture.h | 4 +- 24 files changed, 380 insertions(+), 376 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h 
b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h index 737aa79727..890fecf0c9 100644 --- a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h @@ -50,17 +50,6 @@ public: CLBatchConcatenateLayerKernel &operator=(CLBatchConcatenateLayerKernel &&) = default; /** Default destructor */ ~CLBatchConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] input Input tensor. Data types supported: All. - * @param[in] batch_offset The offset on axis # 3. - * @param[in,out] output Output tensor. Data types supported: Same as @p input. - * - * @note: The output tensor's low two dimensions can't be smaller than the input one's. - * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. - * - */ - void configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output); /** Initialise the kernel's inputs and output * * @param[in] compile_context The compile context to be used. @@ -72,7 +61,7 @@ public: * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int batch_offset, ICLTensor *output); + void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel * * @param[in] input Input tensor info. Data types supported: All. 
@@ -84,12 +73,11 @@ public: static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input; - ICLTensor *_output; - unsigned int _batch_offset; + unsigned int _batch_offset; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H */ diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h index f9a6bb947d..ad365d655b 100644 --- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h @@ -30,8 +30,6 @@ namespace arm_compute { -class ICLTensor; - /** Interface for the depth concatenate kernel. * The input tensor will be concatenated into the output tensor. */ @@ -50,17 +48,6 @@ public: CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default; /** Default destructor */ ~CLDepthConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] output Output tensor. Data types supported: Same as @p input. - * - * @note: The output tensor's low two dimensions can't be smaller than the input one's. - * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. - * - */ - void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output); /** Initialise the kernel's inputs and output * * @param[in] compile_context The compile context to be used. 
@@ -72,7 +59,7 @@ public: * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int depth_offset, ICLTensor *output); + void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel * * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 @@ -84,12 +71,11 @@ public: static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input; - ICLTensor *_output; - unsigned int _depth_offset; + unsigned int _depth_offset; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H */ diff --git a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h index c1b7d6be77..0563fad414 100644 --- a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h @@ -30,8 +30,6 @@ namespace arm_compute { -class ICLTensor; - /** Interface for the height concatenate kernel. * The input tensor will be concatenated into the output tensor. */ @@ -50,14 +48,6 @@ public: CLHeightConcatenateLayerKernel &operator=(CLHeightConcatenateLayerKernel &&) = default; /** Default destructor */ ~CLHeightConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] input Input tensor. Data types supported: All. 
- * @param[in] height_offset The starting offset on the Y axis for the output tensor. - * @param[out] output Output tensor. Data types supported: Same as @p input. - * - */ - void configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output); /** Initialise the kernel's inputs and output * * @param[in] compile_context The compile context to be used. @@ -66,7 +56,7 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int height_offset, ICLTensor *output); + void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLHeightConcatenateLayerKernel * * @param[in] input Input tensor info. Data types supported: All. @@ -78,13 +68,12 @@ public: static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input; - ICLTensor *_output; - unsigned int _height_offset; - unsigned int _num_elems_processed_per_iteration; + unsigned int _height_offset; + unsigned int _num_elems_processed_per_iteration; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */ diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h index e9b3546439..aaedaa90ee 100644 --- a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h +++ b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h @@ -30,8 +30,6 @@ namespace arm_compute { -class ICLTensor; - /** Interface for the 
width concatenate kernel of 2 tensors. * The input1 and input2 tensors will be concatenated into the output tensor. */ @@ -39,7 +37,7 @@ class CLWidthConcatenate2TensorsKernel : public ICLKernel { public: /** Default constructor */ - CLWidthConcatenate2TensorsKernel(); + CLWidthConcatenate2TensorsKernel() = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -50,13 +48,6 @@ public: CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default; /** Default destructor */ ~CLWidthConcatenate2TensorsKernel() = default; - /** Initialise the kernel's input1s and output - * - * @param[in] input1 First input tensor. Data types supported: All. - * @param[in] input2 Second input tensor. Data types supported: same as @p input1 - * @param[out] output Output tensor. Data types supported: Same as @p input1. - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); /** Initialise the kernel's input1s and output * * @param[in] compile_context The compile context to be used. @@ -64,7 +55,7 @@ public: * @param[in] input2 Second input tensor. Data types supported: same as @p input1 * @param[out] output Output tensor. Data types supported: Same as @p input1. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel * * @param[in] input1 First tensor info. Data types supported: All. 
@@ -76,12 +67,8 @@ public: static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - ICLTensor *_output; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) override; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H */ diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h index d3845b68f0..b1d6e890a0 100644 --- a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h +++ b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h @@ -30,8 +30,6 @@ namespace arm_compute { -class ICLTensor; - /** Interface for the width concatenate kernel of 4 tensors. * All input tensors will be concatenated into the output tensor. */ @@ -50,15 +48,6 @@ public: CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default; /** Default destructor */ ~CLWidthConcatenate4TensorsKernel() = default; - /** Initialise the kernel's input1s and output - * - * @param[in] input1 First input tensor. Data types supported: All. - * @param[in] input2 Second input tensor. Data types supported: same as @p input1 - * @param[in] input3 Third input tensor. Data types supported: same as @p input1 - * @param[in] input4 Fourth input tensor. Data types supported: same as @p input1 - * @param[out] output Output tensor. Data types supported: Same as @p input1. - */ - void configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output); /** Initialise the kernel's input1s and output * * @param[in] compile_context The compile context to be used. 
@@ -68,7 +57,7 @@ public: * @param[in] input4 Fourth input tensor. Data types supported: same as @p input1 * @param[out] output Output tensor. Data types supported: Same as @p input1. */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output); + void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel * * @param[in] input1 First tensor info. Data types supported: All. @@ -82,14 +71,8 @@ public: static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input1; - const ICLTensor *_input2; - const ICLTensor *_input3; - const ICLTensor *_input4; - ICLTensor *_output; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) override; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H */ diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h index a42ab5126e..cb8777d5fd 100644 --- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h @@ -30,8 +30,6 @@ namespace arm_compute { -class ICLTensor; - /** Interface for the width concatenate kernel. * The input tensor will be concatenated into the output tensor. 
*/ @@ -50,14 +48,6 @@ public: CLWidthConcatenateLayerKernel &operator=(CLWidthConcatenateLayerKernel &&) = default; /** Default destructor */ ~CLWidthConcatenateLayerKernel() = default; - /** Initialise the kernel's inputs and output - * - * @param[in] input Input tensor. Data types supported: All. - * @param[in] width_offset The offset on the X axis. - * @param[in,out] output Output tensor. Data types supported: Same as @p input. - * - */ - void configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output); /** Initialise the kernel's inputs and output * * @param[in] compile_context The compile context to be used. @@ -66,7 +56,7 @@ public: * @param[in,out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int width_offset, ICLTensor *output); + void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel * * @param[in] input Input tensor info. Data types supported: All. 
@@ -78,12 +68,11 @@ public: static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output); // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; + void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) override; private: - const ICLTensor *_input; - ICLTensor *_output; - unsigned int _width_offset; + unsigned int _width_offset; }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */ diff --git a/arm_compute/runtime/CL/ICLOperator.h b/arm_compute/runtime/CL/ICLOperator.h index 7ad0c8ce0e..c9fdd864de 100644 --- a/arm_compute/runtime/CL/ICLOperator.h +++ b/arm_compute/runtime/CL/ICLOperator.h @@ -54,7 +54,7 @@ public: ICLOperator &operator=(ICLOperator &&) = default; // Inherited methods overridden: - void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override final; + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; void prepare(OperatorTensorMap constants) override final; protected: diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h index c5869345c7..4e8a95be43 100644 --- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h +++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h @@ -24,6 +24,7 @@ #ifndef ARM_COMPUTE_CLCONCATENATELAYER_H #define ARM_COMPUTE_CLCONCATENATELAYER_H +#include "arm_compute/runtime/CL/ICLOperator.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/CL/ICLKernel.h" @@ -51,16 +52,25 @@ class CLConcatenateLayer : public IFunction public: /** Default constructor */ CLConcatenateLayer(); + /** Destructor */ + ~CLConcatenateLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConcatenateLayer(const CLConcatenateLayer &) = delete; + 
/** Default move constructor */ + CLConcatenateLayer(CLConcatenateLayer &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLConcatenateLayer &operator=(const CLConcatenateLayer &) = delete; + /** Default move assignment operator */ + CLConcatenateLayer &operator=(CLConcatenateLayer &&); /** Initialise the kernel's inputs vector and output. * * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. * - * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All. + * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All * @param[out] output Output tensor. Data types supported: Same as @p input. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. */ - void configure(std::vector &inputs_vector, ICLTensor *output, size_t axis); void configure(std::vector &inputs_vector, ICLTensor *output, size_t axis); /** Initialise the kernel's inputs vector and output. * @@ -68,11 +78,10 @@ public: * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. * * @param[in] compile_context The compile context to be used. - * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All. + * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All * @param[out] output Output tensor. Data types supported: Same as @p input. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. 
*/ - void configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis); void configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis); /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer * @@ -85,22 +94,64 @@ public: * * @return a status */ - static Status validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); static Status validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); // Inherited methods overridden: void run() override; private: - template - void configure_internal(const CLCompileContext &compile_context, std::vector &&inputs_vector, ICLTensor *output, size_t axis); + struct Impl; + std::unique_ptr _impl; +}; + +namespace experimental +{ +/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels: + * + * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0). + * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1). + * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2). + * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3). + */ +class CLConcatenateLayer : public ICLOperator +{ +public: + /** Default constructor */ + CLConcatenateLayer(); + /** Initialise the kernel's inputs vector and output. + * + * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. + * + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All + * @param[out] output Output tensor. 
Data types supported: Same as @p input. + * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. + */ + void configure(const CLCompileContext &compile_context, const std::vector &inputs_vector, ITensorInfo *output, size_t axis); + /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer + * + * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel. + * + * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: All + * @param[in] output Output tensor info. Data types supported: Same as @p input. + * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. + * + * @return a status + */ + static Status validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); - template - static Status validate_internal(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis); + // Inherited methods overridden: + MemoryRequirements workspace() const override; + void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override; +private: std::vector> _concat_kernels; unsigned int _num_inputs; unsigned int _axis; }; +} // namespace experimental } // namespace arm_compute #endif /* ARM_COMPUTE_CLCONCATENATELAYER_H */ diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index 2e44eed6f6..e5733cd784 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -30,7 +30,6 @@ #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLMemsetKernel.h" #include 
"arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" -#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLActivationLayer.h" @@ -232,10 +231,10 @@ private: CLCopyKernel _copy_cell_state; CLCopyKernel _copy_output; CLConcatenateLayer _concat_scratch_buffer; - CLWidthConcatenate2TensorsKernel _concat_inputs_forget_gate; - CLWidthConcatenate2TensorsKernel _concat_weights_forget_gate; - CLWidthConcatenate2TensorsKernel _concat_weights_input_gate; - CLWidthConcatenate2TensorsKernel _concat_weights_output; + CLConcatenateLayer _concat_inputs_forget_gate; + CLConcatenateLayer _concat_weights_forget_gate; + CLConcatenateLayer _concat_weights_input_gate; + CLConcatenateLayer _concat_weights_output; CLMemsetKernel _ones_memset_kernel; CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate; CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff; diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp index 9c2b68acc0..b3496f7229 100644 --- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "support/StringSupport.h" @@ -75,34 +75,27 @@ Status validate_arguments(const ITensorInfo *input, unsigned int batch_offset, c } // namespace CLBatchConcatenateLayerKernel::CLBatchConcatenateLayerKernel() - : _input(nullptr), _output(nullptr), _batch_offset(0) + : _batch_offset(0) { } -void CLBatchConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int batch_offset, 
ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, batch_offset, output); -} - -void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int batch_offset, ICLTensor *output) +void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), batch_offset, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, batch_offset, output)); - _input = input; - _output = output; _batch_offset = batch_offset; - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); // Add build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info()) + if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); build_opts.add_option("-DOFFSET_OUT=" + 
float_to_string_with_full_precision(oq_info.offset)); @@ -114,13 +107,13 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), batch_offset, output->info()); + auto win_config = validate_and_configure_window(input, batch_offset, output); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); ICLKernel::configure_internal(std::get<1>(win_config)); // Set output valid region - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); // Set config_id for enabling LWS tuning _config_id = "concatenate_"; @@ -128,13 +121,13 @@ void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_co _config_id += "_"; _config_id += support::cpp11::to_string(batch_offset); _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += support::cpp11::to_string(input->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(1)); + _config_id += support::cpp11::to_string(input->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(2)); + _config_id += support::cpp11::to_string(input->dimension(2)); _config_id += "_"; - _config_id += support::cpp11::to_string(input->info()->dimension(3)); + _config_id += support::cpp11::to_string(input->dimension(3)); } Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input, @@ -146,14 +139,18 @@ Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *i return Status{}; } -void CLBatchConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue) +void CLBatchConcatenateLayerKernel::run_op(const InputTensorMap &inputs, const 
OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(outputs.at(TensorType::ACL_DST)); + Window slice = window.first_slice_window_3D(); - const int offset_to_first_elements_in_bytes = _batch_offset * _output->info()->strides_in_bytes()[3]; + const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3]; unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg(idx, offset_to_first_elements_in_bytes); @@ -161,8 +158,8 @@ void CLBatchConcatenateLayerKernel::run(const Window &window, cl::CommandQueue & do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); } while(window.slide_window_slice_3D(slice)); diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp index 10aa615c6f..0bae901baa 100644 --- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "support/StringSupport.h" @@ -73,34 +73,27 @@ Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, c } // namespace CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel() - : _input(nullptr), _output(nullptr), 
_depth_offset(0) + : _depth_offset(0) { } -void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, depth_offset, output); -} - -void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int depth_offset, ICLTensor *output) +void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), depth_offset, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, depth_offset, output)); - _input = input; - _output = output; _depth_offset = depth_offset; - const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); // Add build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info()) + if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); 
build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); @@ -112,13 +105,13 @@ void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_co _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), depth_offset, output->info()); + auto win_config = validate_and_configure_window(input, depth_offset, output); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); ICLKernel::configure_internal(std::get<1>(win_config)); // Set output valid region - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); } Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input, @@ -130,14 +123,18 @@ Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *i return Status{}; } -void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue) +void CLDepthConcatenateLayerKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(outputs.at(TensorType::ACL_DST)); + Window slice = window.first_slice_window_3D(); - const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2]; + const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2]; unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters _kernel.setArg(idx, 
offset_to_first_elements_in_bytes); @@ -145,8 +142,8 @@ void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue & do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); } while(window.slide_window_slice_3D(slice)); diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp index b3bae4604f..5ab270cc01 100644 --- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp @@ -27,19 +27,14 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "support/StringSupport.h" -#include - namespace arm_compute { namespace @@ -77,7 +72,7 @@ Status validate_arguments(const ITensorInfo *input, unsigned int height_offset, } // namespace CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel() - : _input(nullptr), _output(nullptr), _height_offset(0), _num_elems_processed_per_iteration() + : _height_offset(0), _num_elems_processed_per_iteration() { } @@ -89,33 +84,26 @@ Status CLHeightConcatenateLayerKernel::validate(const ITensorInfo *input, unsign return Status{}; } -void CLHeightConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, height_offset, output); 
-} - -void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int height_offset, ICLTensor *output) +void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output)); - _input = input; - _output = output; _height_offset = height_offset; - auto win_config = validate_and_configure_window(input->info(), output->info(), _num_elems_processed_per_iteration); + auto win_config = validate_and_configure_window(input, output, _num_elems_processed_per_iteration); // Add build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2))); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info()) + if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) { - const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq_info = input->quantization_info().uniform(); + 
const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); @@ -132,17 +120,21 @@ void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_c ICLKernel::configure_internal(std::get<1>(win_config)); // Set output valid region - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); } -void CLHeightConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue) +void CLHeightConcatenateLayerKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(outputs.at(TensorType::ACL_DST)); + unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, window); - add_4D_tensor_argument(idx, _output, window); + add_4D_tensor_argument(idx, src, window); + add_4D_tensor_argument(idx, dst, window); enqueue(queue, *this, window, lws_hint()); } } // namespace arm_compute diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp index c87290d04c..88b69c7cce 100644 --- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp @@ -28,11 +28,10 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" #include 
"arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_info.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "support/StringSupport.h" @@ -79,11 +78,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, } } // namespace -CLWidthConcatenate2TensorsKernel::CLWidthConcatenate2TensorsKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); @@ -91,35 +85,26 @@ Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, con return Status{}; } -void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output); -} - -void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output) +void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _output = output; + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output)); // Add build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - 
build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size())); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2))); + build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0))); + build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size())); // If input have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info()); - if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo) + const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2); + if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo) { - const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); @@ -133,16 +118,16 @@ void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile _kernel = create_kernel(compile_context, 
"concatenate_width_x2", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info()); + auto win_config = validate_and_configure_window(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); ICLKernel::configure_internal(std::get<1>(win_config)); // Set output valid region - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); // Pass paddings as arguments to the kernel - const unsigned int input1_width = input1->info()->dimension(0); + const unsigned int input1_width = input1->dimension(0); const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width; const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration; unsigned int idx0 = 3 * num_arguments_per_4D_tensor(); @@ -151,30 +136,35 @@ void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile // Set config_id for enabling LWS tuning _config_id = "concatenate_width_x2_"; - _config_id += lower_string(string_from_data_type(input1->info()->data_type())); + _config_id += lower_string(string_from_data_type(input1->data_type())); _config_id += "_"; - _config_id += support::cpp11::to_string(input1->info()->dimension(0)); + _config_id += support::cpp11::to_string(input1->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(input1->info()->dimension(1)); + _config_id += support::cpp11::to_string(input1->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(input2->info()->dimension(0)); + _config_id += support::cpp11::to_string(input2->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(input2->info()->dimension(1)); + _config_id += support::cpp11::to_string(input2->dimension(1)); } -void 
CLWidthConcatenate2TensorsKernel::run(const Window &window, cl::CommandQueue &queue) +void CLWidthConcatenate2TensorsKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); Window slice = window.first_slice_window_4D(); + const auto src0 = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC_VEC)); + const auto src1 = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC_VEC + 1)); + auto dst = utils::cast::polymorphic_downcast(outputs.at(TensorType::ACL_DST)); + do { unsigned int idx = 0; - add_4D_tensor_argument(idx, _input1, slice); - add_4D_tensor_argument(idx, _input2, slice); - add_4D_tensor_argument(idx, _output, slice); + add_4D_tensor_argument(idx, src0, slice); + add_4D_tensor_argument(idx, src1, slice); + add_4D_tensor_argument(idx, dst, slice); enqueue(queue, *this, window, lws_hint()); } while(window.slide_window_slice_4D(slice)); diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp index 84744a68aa..e49bb1c8ce 100644 --- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_info.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "support/StringSupport.h" @@ -97,7 +98,6 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, } // namespace CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel() - : _input1(nullptr), _input2(nullptr), _input3(nullptr), _input4(nullptr), _output(nullptr) { } @@ -108,42 +108,33 @@ Status CLWidthConcatenate4TensorsKernel::validate(const 
ITensorInfo *input1, con return Status{}; } -void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input1, input2, input3, input4, output); -} - -void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, - ICLTensor *output) +void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *input1, ITensorInfo *input2, + ITensorInfo *input3, ITensorInfo *input4, + ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), input3->info(), input4->info(), output->info())); - - _input1 = input1; - _input2 = input2; - _input3 = input3; - _input4 = input4; - _output = output; + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, input3, input4, output)); // Add build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2))); - build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0))); - build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->info()->dimension(0))); - build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0))); - build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size())); + 
build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2))); + build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0))); + build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0))); + build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->dimension(0))); + build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size())); // If input have different quantization info set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info(), input3->info(), input4->info()); - if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo) + const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2, input3, input4); + if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo) { - const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq3_info = input3->info()->quantization_info().uniform(); - const UniformQuantizationInfo iq4_info = input4->info()->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform(); + const UniformQuantizationInfo iq3_info = input3->quantization_info().uniform(); + const UniformQuantizationInfo iq4_info = input4->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output->quantization_info().uniform(); build_opts.add_option("-DOFFSET_IN1=" + 
float_to_string_with_full_precision(iq1_info.offset)); build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); @@ -161,18 +152,18 @@ void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input1->info(), input2->info(), input3->info(), input4->info(), output->info()); + auto win_config = validate_and_configure_window(input1, input2, input3, input4, output); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); ICLKernel::configure_internal(std::get<1>(win_config)); // Set output valid region - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); // Pass paddings as arguments to the kernel - const unsigned int input1_width = input1->info()->dimension(0); - const unsigned int input2_width = input2->info()->dimension(0); - const unsigned int input3_width = input3->info()->dimension(0); + const unsigned int input1_width = input1->dimension(0); + const unsigned int input2_width = input2->dimension(0); + const unsigned int input3_width = input3->dimension(0); const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width; const unsigned int input2_left_padding = input1_width % num_elems_processed_per_iteration; @@ -192,40 +183,47 @@ void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile // Set config_id for enabling LWS tuning _config_id = "concatenate_width_x4_"; - _config_id += lower_string(string_from_data_type(input1->info()->data_type())); + _config_id += lower_string(string_from_data_type(input1->data_type())); _config_id += "_"; - _config_id += support::cpp11::to_string(input1->info()->dimension(0)); + _config_id += 
support::cpp11::to_string(input1->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(input1->info()->dimension(1)); + _config_id += support::cpp11::to_string(input1->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(input2->info()->dimension(0)); + _config_id += support::cpp11::to_string(input2->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(input2->info()->dimension(1)); + _config_id += support::cpp11::to_string(input2->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(input3->info()->dimension(0)); + _config_id += support::cpp11::to_string(input3->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(input3->info()->dimension(1)); + _config_id += support::cpp11::to_string(input3->dimension(1)); _config_id += "_"; - _config_id += support::cpp11::to_string(input4->info()->dimension(0)); + _config_id += support::cpp11::to_string(input4->dimension(0)); _config_id += "_"; - _config_id += support::cpp11::to_string(input4->info()->dimension(1)); + _config_id += support::cpp11::to_string(input4->dimension(1)); } -void CLWidthConcatenate4TensorsKernel::run(const Window &window, cl::CommandQueue &queue) +void CLWidthConcatenate4TensorsKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src0 = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC_VEC)); + const auto src1 = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC_VEC + 1)); + const auto src2 = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC_VEC + 2)); + const auto src3 = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC_VEC + 3)); + auto dst = utils::cast::polymorphic_downcast(outputs.at(TensorType::ACL_DST)); + Window slice = 
window.first_slice_window_4D(); do { unsigned int idx = 0; - add_4D_tensor_argument(idx, _input1, slice); - add_4D_tensor_argument(idx, _input2, slice); - add_4D_tensor_argument(idx, _input3, slice); - add_4D_tensor_argument(idx, _input4, slice); - add_4D_tensor_argument(idx, _output, slice); + add_4D_tensor_argument(idx, src0, slice); + add_4D_tensor_argument(idx, src1, slice); + add_4D_tensor_argument(idx, src2, slice); + add_4D_tensor_argument(idx, src3, slice); + add_4D_tensor_argument(idx, dst, slice); enqueue(queue, *this, window, lws_hint()); } while(window.slide_window_slice_4D(slice)); diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp index 20cb962b7e..591c26f877 100644 --- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/CL/CLValidate.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Cast.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "support/StringSupport.h" @@ -74,7 +74,7 @@ Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, c } // namespace CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel() - : _input(nullptr), _output(nullptr), _width_offset(0) + : _width_offset(0) { } @@ -85,31 +85,24 @@ Status CLWidthConcatenateLayerKernel::validate(const ITensorInfo *input, unsigne return Status{}; } -void CLWidthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, width_offset, output); -} - -void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int 
width_offset, ICLTensor *output) +void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), width_offset, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, width_offset, output)); - _input = input; - _output = output; _width_offset = width_offset; // Add build options CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset)); - build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2))); - if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info()) + if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info()) { - const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform(); - const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform(); + const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset)); build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset)); @@ -120,23 +113,27 @@ void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_co // Create kernel _kernel = 
create_kernel(compile_context, "concatenate_width", build_opts.options()); // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), width_offset, output->info()); + auto win_config = validate_and_configure_window(input, width_offset, output); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); ICLKernel::configure_internal(std::get<1>(win_config)); // Set output valid region - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); } -void CLWidthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue) +void CLWidthConcatenateLayerKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, + const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + const auto src = utils::cast::polymorphic_downcast(inputs.at(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(outputs.at(TensorType::ACL_DST)); + unsigned int idx = 0; - add_4D_tensor_argument(idx, _input, window); - add_4D_tensor_argument(idx, _output, window); + add_4D_tensor_argument(idx, src, window); + add_4D_tensor_argument(idx, dst, window); enqueue(queue, *this, window, lws_hint()); } } // namespace arm_compute diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp index ceff6e5cef..d41da4bf7f 100644 --- a/src/graph/backends/CL/CLFunctionsFactory.cpp +++ b/src/graph/backends/CL/CLFunctionsFactory.cpp @@ -42,7 +42,7 @@ namespace backends struct CLTargetInfo { using TensorType = arm_compute::ICLTensor; - using SrcTensorType = TensorType; + using SrcTensorType = const arm_compute::ICLTensor; using TensorConcreteType = CLTensor; static Target TargetType; }; diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp 
b/src/runtime/CL/functions/CLConcatenateLayer.cpp index a4e8665d10..06903d2ff2 100644 --- a/src/runtime/CL/functions/CLConcatenateLayer.cpp +++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp @@ -40,6 +40,8 @@ namespace arm_compute { +namespace experimental +{ CLConcatenateLayer::CLConcatenateLayer() : _concat_kernels(), _num_inputs(0), @@ -47,54 +49,23 @@ CLConcatenateLayer::CLConcatenateLayer() { } -void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) -{ - configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); -} - -void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) -{ - configure_internal(compile_context, std::move(inputs_vector), output, axis); -} - -void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) -{ - configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); -} - -void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) -{ - configure_internal(compile_context, std::move(inputs_vector), output, axis); -} - -Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); -} - -Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) -{ - return validate_internal(inputs_vector, output, axis); -} - -template -void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_context, std::vector &&inputs_vector, ICLTensor *output, size_t axis) +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, const std::vector &inputs_vector, ITensorInfo *output, size_t axis) { ARM_COMPUTE_ERROR_ON(output == nullptr); _axis = axis; _num_inputs = inputs_vector.size(); - std::vector 
inputs_vector_info(inputs_vector.size()); - std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t) + TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis); + std::vector const_inputs_vector(inputs_vector.size()); + std::transform(inputs_vector.begin(), inputs_vector.end(), const_inputs_vector.begin(), [](ITensorInfo * t) { ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); + return t; }); - TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type()); - ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis)); + auto_init_if_empty(*output, output_shape, 1, inputs_vector[0]->data_type()); + ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(const_inputs_vector, output, axis)); unsigned int offset = 0; switch(_axis) @@ -126,7 +97,7 @@ void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_cont { auto kernel = support::cpp14::make_unique(); kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); + offset += inputs_vector.at(i)->dimension(_axis); _concat_kernels.emplace_back(std::move(kernel)); } break; @@ -140,7 +111,7 @@ void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_cont { auto kernel = support::cpp14::make_unique(); kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); + offset += inputs_vector.at(i)->dimension(_axis); _concat_kernels.emplace_back(std::move(kernel)); } break; @@ -151,7 +122,7 @@ void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_cont { auto kernel = 
support::cpp14::make_unique(); kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); + offset += inputs_vector.at(i)->dimension(_axis); _concat_kernels.emplace_back(std::move(kernel)); } break; @@ -162,7 +133,7 @@ void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_cont { auto kernel = support::cpp14::make_unique(); kernel->configure(compile_context, inputs_vector.at(i), offset, output); - offset += inputs_vector.at(i)->info()->dimension(_axis); + offset += inputs_vector.at(i)->dimension(_axis); _concat_kernels.emplace_back(std::move(kernel)); } break; @@ -172,8 +143,7 @@ void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_cont } } -template -Status CLConcatenateLayer::validate_internal(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr); const unsigned int num_inputs = inputs_vector.size(); @@ -250,11 +220,101 @@ Status CLConcatenateLayer::validate_internal(const std::vector return Status{}; } +MemoryRequirements CLConcatenateLayer::workspace() const +{ + return MemoryRequirements{}; +} + +void CLConcatenateLayer::run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) +{ + ARM_COMPUTE_UNUSED(workspace); + + if(inputs.empty() || outputs.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + if(inputs.size() != _num_inputs) + { + ARM_COMPUTE_ERROR("Configured with different number of inputs"); + } + + if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + { + ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); + CLScheduler::get().enqueue_op(*_concat_kernels.at(0), inputs, outputs, true); + } + else + { + int i = 0; + for(auto &k : _concat_kernels) + { + const InputTensorMap input = { { TensorType::ACL_SRC, 
inputs.at(ACL_SRC_VEC + i) } }; + CLScheduler::get().enqueue_op(*k, input, outputs, true); + ++i; + } + } +} +} // namespace experimental + +struct CLConcatenateLayer::Impl +{ + std::vector srcs{}; + ICLTensor *dst{ nullptr }; + unsigned int num_inputs{ 0 }; + unsigned int axis{ 0 }; + std::unique_ptr op{ nullptr }; +}; + +CLConcatenateLayer::CLConcatenateLayer() + : _impl(support::cpp14::make_unique()) +{ +} + +CLConcatenateLayer::CLConcatenateLayer(CLConcatenateLayer &&) = default; + +CLConcatenateLayer &CLConcatenateLayer::operator=(CLConcatenateLayer &&) = default; + +CLConcatenateLayer::~CLConcatenateLayer() = default; + +void CLConcatenateLayer::configure(std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis); +} + +void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector &inputs_vector, ICLTensor *output, size_t axis) +{ + ARM_COMPUTE_ERROR_ON(output == nullptr); + + _impl->srcs = inputs_vector; + _impl->dst = output; + _impl->axis = axis; + _impl->num_inputs = inputs_vector.size(); + _impl->op = arm_compute::support::cpp14::make_unique(); + + std::vector inputs_vector_info; + for(unsigned int i = 0; i < inputs_vector.size(); ++i) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); + inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); + } + _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis); +} + +Status CLConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +{ + return experimental::CLConcatenateLayer::validate(inputs_vector, output, axis); +} + void CLConcatenateLayer::run() { - for(auto &kernel : _concat_kernels) + InputTensorMap srcs; + for(unsigned i = 0; i < _impl->num_inputs; ++i) { - CLScheduler::get().enqueue(*kernel, true); + srcs.insert(std::make_pair(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i))); } + const OutputTensorMap 
dst{ { TensorType::ACL_DST, _impl->dst } }; + + _impl->op->run(srcs, dst, {}); } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index c07d7bc5c4..1b46baaf5e 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -110,7 +110,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_gate_out2); - _concat_inputs_forget_gate.configure(compile_context, input, output_state_in, &_forget_gate_out2); + _concat_inputs_forget_gate.configure(compile_context, inputs_vector, &_forget_gate_out2, Window::DimX); std::vector weights_vector; @@ -119,7 +119,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0); _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_forget_gate.configure(compile_context, input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6); + _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? 
nullptr : forget_gate_bias, &_forget_gate_out5); @@ -188,7 +188,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_input_gate.configure(compile_context, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); + _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX); _memory_group.manage(&_input_gate_out1); @@ -290,7 +290,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); - _concat_weights_output.configure(compile_context, input_to_output_weights, recurrent_to_output_weights, &_output2); + _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX); _memory_group.manage(&_output1); _memory_group.manage(&_output4); @@ -371,7 +371,7 @@ void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTe _copy_output.configure(compile_context, output_state_out, output); // Vector for holding the tensors to store in scratch buffer - std::vector scratch_inputs; + std::vector scratch_inputs; if(!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); @@ -485,7 +485,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); - 
ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); if(lstm_params.has_peephole_opt()) { @@ -516,7 +516,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); @@ -567,7 +567,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, in_out_weights.emplace_back(recurrent_to_output_weights); TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); - ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat)); + ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); // Validate output gate tmp ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? 
nullptr : output_gate_bias, &output_gate_tmp)); @@ -604,7 +604,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output)); // Validate scratch concatenation - std::vector inputs_vector_info_raw; + std::vector inputs_vector_info_raw; if(!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); @@ -623,7 +623,7 @@ void CLLSTMLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - CLScheduler::get().enqueue(_concat_inputs_forget_gate); + _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); @@ -721,12 +721,12 @@ void CLLSTMLayer::prepare() { if(!_is_prepared) { - CLScheduler::get().enqueue(_concat_weights_forget_gate); + _concat_weights_forget_gate.run(); if(!_run_cifg_opt) { - CLScheduler::get().enqueue(_concat_weights_input_gate); + _concat_weights_input_gate.run(); } - CLScheduler::get().enqueue(_concat_weights_output); + _concat_weights_output.run(); _is_prepared = true; } } diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index 8e9f393da5..3b9dde2bf7 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -438,7 +438,8 @@ void Fallback::run() const int granule_threshold = 200; scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); } - else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (_d->info()->data_type() == DataType::F32 || _d->info()->data_type() == DataType::F16 || _d->info()->data_type() == DataType::U8 || _d->info()->data_type() == DataType::S8) ) + else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (_d->info()->data_type() == DataType::F32 || _d->info()->data_type() == DataType::F16 + || _d->info()->data_type() == DataType::U8 || _d->info()->data_type() == DataType::S8)) { 
//GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions const int granule_threshold = 200; diff --git a/tests/validation/CL/BatchConcatenateLayer.cpp b/tests/validation/CL/BatchConcatenateLayer.cpp index 327ddfc9d6..6fd189b716 100644 --- a/tests/validation/CL/BatchConcatenateLayer.cpp +++ b/tests/validation/CL/BatchConcatenateLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -70,7 +70,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( inputs_vector_info.emplace_back(std::move(input_info1)); inputs_vector_info.emplace_back(std::move(input_info2)); - std::vector inputs_vector_info_raw; + std::vector inputs_vector_info_raw; inputs_vector_info_raw.reserve(inputs_vector_info.size()); for(auto &input : inputs_vector_info) { @@ -97,8 +97,8 @@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - std::vector inputs; + CLConcatenateLayer concat_layer; + std::vector inputs; inputs.emplace_back(&src1); inputs.emplace_back(&src2); inputs.emplace_back(&src3); diff --git a/tests/validation/CL/DepthConcatenateLayer.cpp b/tests/validation/CL/DepthConcatenateLayer.cpp index a3dbffb4c1..4f5bd118bf 100644 --- a/tests/validation/CL/DepthConcatenateLayer.cpp +++ b/tests/validation/CL/DepthConcatenateLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -67,7 +67,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( inputs_vector_info.emplace_back(std::move(input_info1)); inputs_vector_info.emplace_back(std::move(input_info2)); - std::vector inputs_vector_info_raw; + std::vector inputs_vector_info_raw; inputs_vector_info_raw.reserve(inputs_vector_info.size()); for(auto &input : inputs_vector_info) { @@ -94,8 +94,8 @@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - std::vector inputs; + CLConcatenateLayer concat_layer; + std::vector inputs; inputs.emplace_back(&src1); inputs.emplace_back(&src2); inputs.emplace_back(&src3); diff --git a/tests/validation/CL/WidthConcatenateLayer.cpp b/tests/validation/CL/WidthConcatenateLayer.cpp index 665f8011f0..408fe148d7 100644 --- a/tests/validation/CL/WidthConcatenateLayer.cpp +++ b/tests/validation/CL/WidthConcatenateLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -71,7 +71,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( inputs_vector_info.emplace_back(std::move(input_info1)); inputs_vector_info.emplace_back(std::move(input_info2)); - std::vector inputs_vector_info_raw; + std::vector inputs_vector_info_raw; inputs_vector_info_raw.reserve(inputs_vector_info.size()); for(auto &input : inputs_vector_info) { @@ -98,8 +98,8 @@ TEST_CASE(Configuration, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS); // Create and configure function - CLConcatenateLayer concat_layer; - std::vector inputs; + CLConcatenateLayer concat_layer; + std::vector inputs; inputs.emplace_back(&src1); inputs.emplace_back(&src2); inputs.emplace_back(&src3); diff --git a/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp b/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp index 1d183b96b9..6ad6fcf5c4 100644 --- a/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp +++ b/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2020 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ TEST_SUITE(GC) TEST_SUITE(DepthConcatenateLayer) template -using GCDepthConcatenateLayerFixture = ConcatenateLayerValidationFixture; +using GCDepthConcatenateLayerFixture = ConcatenateLayerValidationFixture; TEST_SUITE(Float) TEST_SUITE(FP16) diff --git a/tests/validation/fixtures/ConcatenateLayerFixture.h b/tests/validation/fixtures/ConcatenateLayerFixture.h index a56eac1fdf..e85f81cb53 100644 --- a/tests/validation/fixtures/ConcatenateLayerFixture.h +++ b/tests/validation/fixtures/ConcatenateLayerFixture.h @@ -43,11 +43,11 @@ namespace test { namespace validation { -template +template class ConcatenateLayerValidationFixture : public framework::Fixture { private: - using SrcITensorType = typename std::conditional::value, const ITensorType, ITensorType>::type; + using SrcITensorType = typename std::conditional::type; public: template -- cgit v1.2.1