COMPMID-909: Enabling in-place computation for batchnormalization and activation at graph level

Change-Id: I84d4a212629b21794451ab5fb5c5b187b5e28f98
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120127
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
author: Michele Di Giorgio <michele.digiorgio@arm.com> 2018-02-13 15:24:04 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:47:18 +0000
commit: dde9ec96f471127e5b6d8dfaeffce024b6326f1a (patch)
tree: 3aa88c0dec625feeb9d17da825b87398cac6cc68
parent: e3fba0afa892c66379da1e3d3843f2155a1fb29a (diff)
download: ComputeLibrary-dde9ec96f471127e5b6d8dfaeffce024b6326f1a.tar.gz
17 files changed, 74 insertions, 32 deletions
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
index fbe450c4f2..c6d8f96a87 100644
--- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
@@ -73,6 +73,7 @@ public:
 private:
     ICLTensor *_input;
     ICLTensor *_output;
+    bool       _run_in_place;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index fee5dd3bae..e9fd564fbd 100644
--- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -96,6 +96,7 @@ private:
     const ICLTensor *_beta;
     const ICLTensor *_gamma;
     float            _epsilon;
+    bool             _run_in_place;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h
index 56b50b9424..1969423074 100644
--- a/arm_compute/graph/INode.h
+++ b/arm_compute/graph/INode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,6 +58,16 @@ public:
      * @return The updated target hint
      */
     TargetHint override_target_hint(TargetHint target_hint) const;
+    /** Method to check if the node supports in-place operations.
+     *
+     * @return True if the node supports in-place operations, false otherwise.
+     */
+    virtual bool supports_in_place() const;
+    /** Set the value of the _supports_in_place attribute.
+     *
+     * @param[in] value Boolean value to assign to _supports_in_place.
+     */
+    void set_supports_in_place(bool value);
 
 protected:
     /** Interface to be implement that override the hints
@@ -70,6 +80,7 @@ protected:
 
 protected:
     TargetHint _target_hint{ TargetHint::DONT_CARE };
+    bool       _supports_in_place{ false };
 };
 } // namespace graph
 } // namespace arm_compute
diff --git a/arm_compute/graph/NodeContext.h b/arm_compute/graph/NodeContext.h
index bc90f217a7..17ae49740b 100644
--- a/arm_compute/graph/NodeContext.h
+++ b/arm_compute/graph/NodeContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,7 +60,7 @@ public:
      * @param[in] input Input to add
      */
     void add_input(arm_compute::ITensor *input);
-    /** Adds and output to the context
+    /** Adds an output to the context
      *
      * @param[in] output Output to add
      */
diff --git a/arm_compute/graph/nodes/BatchNormalizationLayer.h b/arm_compute/graph/nodes/BatchNormalizationLayer.h
index 266c3905d8..abbf09a54e 100644
--- a/arm_compute/graph/nodes/BatchNormalizationLayer.h
+++ b/arm_compute/graph/nodes/BatchNormalizationLayer.h
@@ -51,6 +51,7 @@ public:
     BatchNormalizationLayer(AccessorType &&mean, AccessorType &&var, AccessorType &&gamma, AccessorType &&beta, float epsilon, ActivationLayerInfo act_info = ActivationLayerInfo())
         : _mean(std::move(mean)), _var(std::move(var)), _gamma(std::move(gamma)), _beta(std::move(beta)), _epsilon(epsilon), _act_info(act_info)
     {
+        set_supports_in_place(true);
     }
 
     // Inherited methods overriden:
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h
index 5b99abc5fb..8cb41d225a 100644
--- a/arm_compute/runtime/CL/functions/CLActivationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@ class CLActivationLayer : public ICLSimpleFunction
 public:
     /** Set the input and output tensor.
      *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
+     * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
      *
      * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
      *                          of the activation function. Data types supported: QS8/QS16/F16/F32.
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
index 3d5145a697..39f567d6a3 100644
--- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -46,7 +46,7 @@ public:
     CLBatchNormalizationLayer();
     /** Set the input and output tensors.
      *
-     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
+     * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place
      *
      * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
      *                          3 lower dimensions represent a single input with dimensions [width, height, FM].
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index 007c53a0a8..31714216fb 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@ class NEActivationLayer : public INESimpleFunction
 public:
     /** Set the input and output tensor.
      *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
+     * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
      *
      * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
      *                                 of the activation function. Data types supported: QS8/QS16/F32.
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
index 242144c987..85c62663ab 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -46,7 +46,7 @@ public:
     NEBatchNormalizationLayer();
     /** Set the input and output tensors.
      *
-     * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
+     * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place
      *
      * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
      *                          3 lower dimensions represent a single input with dimensions [width, height, FM].
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 8d4c0b82d2..a78b3e1b93 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -96,7 +96,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 CLActivationLayerKernel::CLActivationLayerKernel()
-    : _input(nullptr), _output(nullptr)
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
 }
 
@@ -104,6 +104,8 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
+    _run_in_place = (output == nullptr) || (output == input);
+
     if(output != nullptr)
     {
         // Output auto inizialitation if not yet initialized
@@ -147,12 +149,15 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
         build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
         build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
 
+        const int o1 = input->info()->quantization_info().offset;
+        // Quantized value of 0 corresponds to the offset o1
+        build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
+
         // Set scale and offset of the input and output if they have different quantization info
         if(is_data_type_quantized_asymmetric(dt) && output != nullptr)
         {
             const float s1 = input->info()->quantization_info().scale;
             const float s2 = output->info()->quantization_info().scale;
-            const int   o1 = input->info()->quantization_info().offset;
             const int   o2 = output->info()->quantization_info().offset;
 
             if(o1 != o2 || s1 != s2)
@@ -162,9 +167,6 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
                 build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
                 build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
             }
-
-            // Quantized value of 0 corresponds to the offset o1
-            build_opts.emplace(("-DCONST_0=" + support::cpp11::to_string(o1)));
         }
     }
     else
@@ -173,7 +175,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
         build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
     }
 
-    build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
+    build_opts.emplace((_run_in_place) ? "-DIN_PLACE" : "");
     if(is_data_type_fixed_point(dt))
     {
         build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position)));
@@ -188,7 +190,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
     _output = output;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
@@ -203,8 +205,9 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
 
 Status CLActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
+    const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
 
     return Status{};
 }
@@ -221,7 +224,7 @@ void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(_output != nullptr)
+        if(!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 95487a23db..87fc1d097c 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -101,7 +101,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 } // namespace
 
 CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
 {
 }
 
@@ -118,6 +118,8 @@ void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *out
     _gamma   = gamma;
     _epsilon = epsilon;
 
+    _run_in_place = (output == nullptr) || (output == input);
+
     if(output != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
@@ -137,19 +139,19 @@ void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *out
     build_opts.add_option_if(act_info.enabled(), "-D" + string_from_activation_func(act_info.activation()));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
     build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
-    build_opts.add_option_if(output == nullptr, "-DIN_PLACE");
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
     build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts.options()));
 
     // Set kernel static arguments
-    unsigned int include_output = (output != nullptr) ? 1 : 0;
+    unsigned int include_output = (!_run_in_place) ? 1 : 0;
     unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
     _kernel.setArg<cl_float>(idx++, _epsilon);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 
@@ -168,8 +170,9 @@ Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const
                                                  const ITensorInfo *beta, const ITensorInfo *gamma,
                                                  float epsilon, ActivationLayerInfo act_info)
 {
+    const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
 
     return Status{};
 }
@@ -184,7 +187,7 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-    unsigned int include_output = (_output != nullptr) ? 1 : 0;
+    unsigned int include_output = (!_run_in_place) ? 1 : 0;
     unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor();
     add_1D_tensor_argument(idx, _mean, vector_slice);
     add_1D_tensor_argument(idx, _var, vector_slice);
@@ -195,7 +198,7 @@ void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue
     {
         idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(_output != nullptr)
+        if(!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 7af313acbb..98d95904dc 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -131,6 +131,11 @@ void Graph::Private::configure(GraphHints _next_hints)
         _previous_hints = _current_hints; // For the first node just assume the previous node was of the same type as this one
     }
 
+    if(_current_node->supports_in_place())
+    {
+        _current_output = _current_input;
+    }
+
     //Automatic output configuration ?
     if(_current_output == nullptr)
     {
@@ -152,8 +157,12 @@ void Graph::Private::configure(GraphHints _next_hints)
     _ctx.hints()                                 = _current_hints;
     std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input, _current_output);
 
-    // Allocate current input
-    _current_input->allocate();
+    // If the operation is done in-place, do not allocate or it will prevent following layers from performing the configuration
+    if(!_current_node->supports_in_place())
+    {
+        // Allocate current input
+        _current_input->allocate();
+    }
 
     // Map input if needed
     if(_current_input->target() == TargetHint::OPENCL)
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index 582f936351..c753f66b43 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,6 +39,14 @@ TargetHint INode::override_target_hint(TargetHint target_hint) const
     ARM_COMPUTE_ERROR_ON(target_hint == TargetHint::OPENCL && !opencl_is_available());
     return target_hint;
 }
+bool INode::supports_in_place() const
+{
+    return _supports_in_place;
+}
+void INode::set_supports_in_place(bool value)
+{
+    _supports_in_place = value;
+}
 GraphHints INode::node_override_hints(GraphHints hints) const
 {
     TargetHint target_hint = hints.target_hint();
diff --git a/src/graph/SubGraph.cpp b/src/graph/SubGraph.cpp
index f62b2617c5..b1cbb9cc95 100644
--- a/src/graph/SubGraph.cpp
+++ b/src/graph/SubGraph.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,6 +67,10 @@ std::unique_ptr<Graph> SubGraph::construct(const GraphContext &ctx, std::unique_
     }
     graph->add_tensor_object(std::move(_input));
 
+    // Make sure first and last nodes of the subgraph always do operations out-of-place
+    _nodes.front()->set_supports_in_place(false);
+    _nodes.back()->set_supports_in_place(false);
+
     // Construct nodes
     for(auto &node : _nodes)
     {
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
index 54f30ef777..546c42a1e5 100644
--- a/src/graph/nodes/ActivationLayer.cpp
+++ b/src/graph/nodes/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,7 @@ using namespace arm_compute::graph;
 ActivationLayer::ActivationLayer(const ActivationLayerInfo activation_info)
     : _activation_info(activation_info)
 {
+    set_supports_in_place(true);
 }
 
 std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index eaf2ca586c..4aeb3a15e1 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index cdf1b54659..6af71a3580 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
author	Michele Di Giorgio <michele.digiorgio@arm.com>	2018-02-13 15:24:04 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:47:18 +0000
commit	dde9ec96f471127e5b6d8dfaeffce024b6326f1a (patch)
tree	3aa88c0dec625feeb9d17da825b87398cac6cc68
parent	e3fba0afa892c66379da1e3d3843f2155a1fb29a (diff)
download	ComputeLibrary-dde9ec96f471127e5b6d8dfaeffce024b6326f1a.tar.gz