From b30dcc5ab8eb2bd37f0ab742af1ec45113d54296 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 20 Jun 2017 09:07:21 +0100 Subject: COMPMID-345 - In-place computation for Activation Layer Change-Id: I25ebfccc3d3e758cc8164e0b33805c0bb303891a Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78226 Tested-by: Kaizen Reviewed-by: Pablo Tello Reviewed-by: Georgios Pinitas --- .../core/CL/kernels/CLActivationLayerKernel.h | 34 +++++++++-- .../core/NEON/kernels/NEActivationLayerKernel.h | 17 ++++-- .../runtime/CL/functions/CLActivationLayer.h | 11 ++-- .../runtime/NEON/functions/NEActivationLayer.h | 11 ++-- src/core/CL/cl_kernels/activation_layer.cl | 18 ++++-- src/core/CL/kernels/CLActivationLayerKernel.cpp | 67 +++++++++++++++++++--- src/core/NEON/kernels/NEActivationLayerKernel.cpp | 51 +++++++++++----- src/runtime/CL/functions/CLActivationLayer.cpp | 2 +- src/runtime/NEON/functions/NEActivationLayer.cpp | 2 +- 9 files changed, 166 insertions(+), 47 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h index 490e70544b..df22574de8 100644 --- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h @@ -24,23 +24,45 @@ #ifndef __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ #define __ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ -#include "arm_compute/core/CL/ICLSimple3DKernel.h" +#include "arm_compute/core/CL/ICLKernel.h" namespace arm_compute { class ICLTensor; /** Interface for the activation layer kernel. */ -class CLActivationLayerKernel : public ICLSimple3DKernel +class CLActivationLayerKernel : public ICLKernel { public: + /** Default constructor */ + CLActivationLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLActivationLayerKernel(const CLActivationLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLActivationLayerKernel &operator=(const CLActivationLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + CLActivationLayerKernel(CLActivationLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + CLActivationLayerKernel &operator=(CLActivationLayerKernel &&) = default; + /** Default destructor */ + ~CLActivationLayerKernel() = default; /** Set the input and output tensor. * - * @param[in] input Source tensor. Data types supported: F16, F32, U16, S16. - * @param[out] output Destination tensor. Data type should match the input data type. - * @param[in] act_info Activation layer information. + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: F16/F32. + * @param[out] output Destination tensor. Data type should match the input data type. + * @param[in] act_info Activation layer information. 
*/ - void configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); + void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_output; }; } #endif /*__ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index 97f92d6a1e..539bca587a 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -25,14 +25,14 @@ #define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ #include "arm_compute/core/FixedPoint.h" -#include "arm_compute/core/NEON/INESimpleKernel.h" +#include "arm_compute/core/NEON/INEKernel.h" namespace arm_compute { class ITensor; /** Interface for the activation layer kernel. */ -class NEActivationLayerKernel : public INESimpleKernel +class NEActivationLayerKernel : public INEKernel { public: /** Constructor */ @@ -47,11 +47,14 @@ public: NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default; /** Set the input and output tensor. * - * @param[in] input Source tensor. Data types supported: QS8/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input - * @param[in] activation_info Activation layer information. + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] activation_info Activation layer information. */ - void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info); + void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info); // Inherited methods overridden: void run(const Window &window) override; @@ -77,6 +80,8 @@ private: typename std::enable_if::value, void>::type activation(const Window &window); private: + ITensor *_input; + ITensor *_output; ActivationFunctionExecutorPtr _func; ActivationLayerInfo _act_info; }; diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h index 6468c996a2..3028afb25b 100644 --- a/arm_compute/runtime/CL/functions/CLActivationLayer.h +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -41,11 +41,14 @@ class CLActivationLayer : public ICLSimpleFunction public: /** Set the input and output tensor. * - * @param[in] input Source tensor. Data types supported: F16, F32, U16, S16. - * @param[out] output Destination tensor. Data type should match the input data type. - * @param[in] act_info Activation layer parameters. + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: F16/F32. + * @param[out] output Destination tensor. Data type should match the input data type. + * @param[in] act_info Activation layer parameters. 
*/ - void configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); + void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); }; } #endif /* __ARM_COMPUTE_CLACTIVATIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h index 35366e16fb..b1a211553d 100644 --- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -41,11 +41,14 @@ class NEActivationLayer : public INESimpleFunction public: /** Set the input and output tensor. * - * @param[in] input Source tensor. Data type supported: QS8/F32. - * @param[out] output Destination tensor. Data type supported: same as @p input - * @param[in] activation_info Activation layer parameters. + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] activation_info Activation layer parameters. */ - void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info); + void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info); }; } #endif /* __ARM_COMPUTE_NEACTIVATIONLAYER_H__ */ diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl index e3cbb6c801..136191aa22 100644 --- a/src/core/CL/cl_kernels/activation_layer.cl +++ b/src/core/CL/cl_kernels/activation_layer.cl @@ -24,6 +24,8 @@ #include "helpers.h" /** This performs an activation function floating point inputs. + * + * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * @note Activation function should be given as a preprocessor argument using -DNAME. e.g. 
-DTANH @@ -48,12 +50,20 @@ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image */ __kernel void activation_layer( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) + TENSOR3D_DECLARATION(input) +#if !defined IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif +) { // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#if defined IN_PLACE + Tensor3D output = input; +#else Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif // Load data VEC_DATA_TYPE(DATA_TYPE, 16) @@ -63,7 +73,7 @@ __kernel void activation_layer( #if defined LOGISTIC data = 1 / (1 + exp(-data)); #elif defined TANH - data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data); + data = (VEC_DATA_TYPE(DATA_TYPE, 16))A * tanh((VEC_DATA_TYPE(DATA_TYPE, 16))B * data); #elif defined RELU data = max(0, data); #elif defined BRELU diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp index 83bbe6a3be..6439426e83 100644 --- a/src/core/CL/kernels/CLActivationLayerKernel.cpp +++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp @@ -35,17 +35,24 @@ using namespace arm_compute; -void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +CLActivationLayerKernel::CLActivationLayerKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + if(output != nullptr) + { + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + } // Set build options std::set build_opts; @@ -54,11 +61,55 @@ void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *outpu build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); build_opts.insert(("-DA=" + val_to_string(act_info.a()))); build_opts.insert(("-DB=" + val_to_string(act_info.b()))); + build_opts.insert(output == nullptr ? 
"-DIN_PLACE" : ""); // Create kernel _kernel = static_cast(CLKernelLibrary::get().create_kernel("activation_layer", build_opts)); // Make sure _kernel is initialized before calling the parent's configure constexpr unsigned int num_elems_processed_per_iteration = 16; - ICLSimple3DKernel::configure(input, output, num_elems_processed_per_iteration); + + _input = input; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + if(output != nullptr) + { + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + } + else + { + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + } + + ICLKernel::configure(win); +} + +void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + if(_output != nullptr) + { + add_3D_tensor_argument(idx, _output, slice); + } + enqueue(queue, *this, slice); + } + while(window.slide_window_slice_3D(slice)); } diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index a878078007..1bd0353b93 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -41,21 +41,29 @@ using namespace arm_compute; NEActivationLayerKernel::NEActivationLayerKernel() - : _func(nullptr), _act_info(ActivationFunction::LOGISTIC) + : _input(nullptr), _output(nullptr), _func(nullptr), _act_info(ActivationFunction::LOGISTIC) { } -void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info) +void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); - ARM_COMPUTE_ERROR_ON_NULLPTR(output); - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + _input = input; + _act_info = activation_info; + _output = input; + + if(output != nullptr) + { + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); - ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + _output = output; + } // Activation functions : FP32 static std::map act_map_f32 = @@ -85,9 +93,6 @@ void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, A { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, }; - _input = input; - _output = output; - _act_info = 
activation_info; switch(input->info()->data_type()) { case DataType::F32: @@ -102,7 +107,27 @@ void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, A constexpr unsigned int num_elems_processed_per_iteration = 16; - INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration); + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + if(output != nullptr) + { + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + } + else + { + // In-place computation + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + } + + ICPPKernel::configure(win); } template @@ -295,7 +320,7 @@ typename std::enable_if::value, void>::type NEActivation void NEActivationLayerKernel::run(const Window &window) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); (this->*_func)(window); diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp index 9b5bd8b663..0d0da0ca9a 100644 --- a/src/runtime/CL/functions/CLActivationLayer.cpp +++ b/src/runtime/CL/functions/CLActivationLayer.cpp @@ -28,7 +28,7 @@ using namespace arm_compute; -void CLActivationLayer::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) +void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info) { auto k = arm_compute::cpp14::make_unique(); k->configure(input, output, act_info); diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index f5d81d7cd8..447ae6411b 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -28,7 +28,7 @@ using namespace arm_compute; -void NEActivationLayer::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info) +void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info) { auto k = arm_compute::cpp14::make_unique(); k->configure(input, output, activation_info); -- cgit v1.2.1
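
Usage sketch: the in-place path added by this patch can be exercised roughly as follows, assuming the library's standard runtime types of this version (Tensor, TensorInfo, NEActivationLayer). Per the configure() documentation above, passing nullptr as the output tensor makes the activation write its result back into the input tensor; this is a minimal illustration, not code from the patch itself.

    // Minimal sketch: in-place RELU on a small F32 tensor using the NEON runtime.
    // Assumes the arm_compute runtime API as it exists in this library version.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Describe and allocate a 16x16 single-channel F32 tensor.
        Tensor src;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        src.allocator()->allocate();

        // ... fill 'src' with input data here ...

        // A nullptr output selects the in-place path introduced by this patch,
        // so the RELU result is written back into 'src'.
        NEActivationLayer act;
        act.configure(&src, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        // Execute; 'src' now holds the activated values.
        act.run();

        return 0;
    }

The CL variant follows the same pattern: CLActivationLayer::configure(&src, nullptr, act_info) builds the kernel with -DIN_PLACE, so only the input tensor argument is passed to the OpenCL kernel at run time.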