From e29acf14f5c3f2d2c20799a1ea3e4aad50dff834 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 16 Jul 2018 14:40:09 +0100
Subject: COMPMID-1365: Add support for NHWC in CLDepthConcatenateLayer

Change-Id: I3ed55bdb95d888aff0b0b76fb841bf1669659308
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/139963
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 .../CL/kernels/CLDepthConcatenateLayerKernel.h     | 11 ++-
 .../CL/kernels/CLWidthConcatenateLayerKernel.h     |  2 +-
 arm_compute/core/Utils.h                           | 31 ---------
 arm_compute/core/utils/misc/ShapeCalculator.h      | 25 +++++++
 arm_compute/runtime/CL/CLFunctions.h               |  1 +
 .../runtime/CL/functions/CLConcatenateLayer.h      | 81 ++++++++++++++++++++++
 .../runtime/CL/functions/CLDepthConcatenateLayer.h | 26 ++++++-
 .../runtime/CL/functions/CLWidthConcatenateLayer.h | 14 ++--
 8 files changed, 153 insertions(+), 38 deletions(-)
 create mode 100644 arm_compute/runtime/CL/functions/CLConcatenateLayer.h

diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
index cbcab8f554..ff8009085f 100644
--- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
@@ -52,7 +52,7 @@ public:
     ~CLDepthConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: F16/F32.
+     * @param[in]     input        Input tensor. Data types supported: QASYMM8/F16/F32.
      * @param[in]     depth_offset The offset on the Z axis.
      * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
      *
      *
      */
     void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
+     *
+     * @param[in] input        Input tensor info. Data types supported: QASYMM8/F16/F32
+     * @param[in] depth_offset The offset on the Z axis.
+     * @param[in] output       Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
index d206eb0da7..7ecd9276aa 100644
--- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
@@ -58,7 +58,7 @@ public:
      *
      */
     void configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
+    /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel
      *
      * @param[in] input        Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[in] width_offset The offset on the X axis.
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 729a46fe3f..1cdfd389db 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -630,37 +630,6 @@ inline uint32_t calculate_matrix_scale(const int16_t *matrix, unsigned int matri
     return std::max(1, std::abs(std::accumulate(matrix, matrix + size, 0)));
 }
 
-/** Calculate the output shapes of the depth concatenate function.
- *
- * @param[in] inputs_vector The vector that stores all the pointers to input.
- *
- * @return the output shape
- */
-template <typename T>
-TensorShape calculate_depth_concatenate_shape(const std::vector<T *> &inputs_vector)
-{
-    TensorShape out_shape = inputs_vector[0]->info()->tensor_shape();
-
-    size_t max_x = 0;
-    size_t max_y = 0;
-    size_t depth = 0;
-
-    for(const auto &tensor : inputs_vector)
-    {
-        ARM_COMPUTE_ERROR_ON(tensor == nullptr);
-        const TensorShape shape = tensor->info()->tensor_shape();
-        max_x = std::max(shape.x(), max_x);
-        max_y = std::max(shape.y(), max_y);
-        depth += shape.z();
-    }
-
-    out_shape.set(0, max_x);
-    out_shape.set(1, max_y);
-    out_shape.set(2, depth);
-
-    return out_shape;
-}
-
 /** Adjust tensor shape size if width or height are odd for a given multi-planar format. No modification is done for other formats.
  *
  * @note Adding here a few links discussing the issue of odd size and sharing the same solution:
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 9bf6b046b4..e5516ba154 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -413,6 +413,31 @@ inline TensorShape get_shape_from_info(ITensorInfo *info)
     return info->tensor_shape();
 }
 
+template <typename T>
+inline TensorShape calculate_depth_concatenate_shape(const std::vector<T *> &inputs_vector)
+{
+    TensorShape out_shape = get_shape_from_info(inputs_vector[0]);
+
+    size_t max_x = 0;
+    size_t max_y = 0;
+    size_t depth = 0;
+
+    for(const auto &tensor : inputs_vector)
+    {
+        ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+        const TensorShape shape = get_shape_from_info(tensor);
+        max_x = std::max(shape.x(), max_x);
+        max_y = std::max(shape.y(), max_y);
+        depth += shape.z();
+    }
+
+    out_shape.set(0, max_x);
+    out_shape.set(1, max_y);
+    out_shape.set(2, depth);
+
+    return out_shape;
+}
+
 template <typename T>
 inline TensorShape calculate_width_concatenate_shape(const std::vector<T *> &inputs_vector)
 {
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 0b69c96673..5e42715c2f 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -42,6 +42,7 @@
 #include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
 #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
 #include "arm_compute/runtime/CL/functions/CLColorConvert.h"
+#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 #include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
 #include "arm_compute/runtime/CL/functions/CLConvolution.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
new file mode 100644
index 0000000000..018c58942f
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLCONCATENATELAYER_H__
+#define __ARM_COMPUTE_CLCONCATENATELAYER_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+class ITensorInfo;
+class Status;
+
+/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref CLWidthConcatenateLayer (if underlying concatenation axis is 0).
+ * -# @ref CLDepthConcatenateLayer (if underlying concatenation axis is 2).
+ */
+class CLConcatenateLayer : public IFunction
+{
+public:
+    /** Default constructor */
+    CLConcatenateLayer();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
+     * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayer and @ref CLDepthConcatenateLayer.
+     *
+     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
+     * @param[out]    output        Output tensor. Data types supported: Same as @p input.
+     * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0 and 2.
+     */
+    void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output, DataLayoutDimension axis);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
+     *
+     * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
+     * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayer and @ref CLDepthConcatenateLayer.
+     *
+     * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/F16/F32.
+     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
+     * @param[in] axis          Concatenation axis. Supported underlying concatenation axis are 0 and 2.
+     *
+     * @return a status
+     */
+    static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, DataLayoutDimension axis);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    std::unique_ptr<IFunction> _concat_function;
+};
+}
+#endif /* __ARM_COMPUTE_CLCONCATENATELAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h
index d505814e73..bafce1c66f 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h
@@ -52,10 +52,34 @@ public:
     CLDepthConcatenateLayer();
     /** Initialise the kernel's inputs vector and output.
      *
-     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: F16/F32.
+     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
+     *                              Input dimensions might differ for each input for the first three dimensions (width, height, depth)
+     *                              and must match for the rest.
+     *                              Note that the difference between the minimum and maximum width and height among the input tensors
+     *                              must be divisible by 2 otherwise it is not clear how padding should be added on the inputs' width and
+     *                              height when they are less than the maximum input sizes.
      * @param[out]    output        Output tensor. Data types supported: Same as @p input.
+     *                              Output tensor dimensions match the inputs' ones from the fourth dimension and above,
+     *                              while width and height are the maximum width and height of the input tensors.
+     *                              Finally, depth is the sum of the input depths.
      */
     void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayer
+     *
+     * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
+     *                          Input dimensions might differ for each input for the first three dimensions (width, height, depth)
+     *                          and must match for the rest.
+     *                          Note that the difference between the minimum and maximum width and height among the input tensors
+     *                          must be divisible by 2 otherwise it is not clear how padding should be added on the inputs' width and
+     *                          height when they are less than the maximum input sizes.
+     * @param[in] output        Output tensor. Data types supported: Same as @p input.
+     *                          Output tensor dimensions match the inputs' ones from the fourth dimension and above,
+     *                          while width and height are the maximum width and height of the input tensors.
+     *                          Finally, depth is the sum of the input depths.
+     *
+     * @return a status
+     */
+    static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output);

     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
index 289191e030..44462b02b2 100644
--- a/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWidthConcatenateLayer.h
@@ -50,14 +50,20 @@ public:
     CLWidthConcatenateLayer();
     /** Initialise the kernel's inputs vector and output.
      *
-     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
-     * @param[out]    output        Output tensor. Data types supported: Same as @p input.
+     * @param[in]  inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
+     *                           Dimensions of all the inputs should match apart for the width which can differ.
+     * @param[out] output        Output tensor. Data types supported: Same as @p input.
+     *                           Output tensor dimensions are the same with the inputs from the second dimension and above.
+     *                           The first dimension (width) is the sum of the input tensors' widths.
      */
     void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
      *
-     * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/F16/F32.
-     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
+     * @param[in] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/F16/F32.
+     *                          Dimensions of all the inputs should match apart for the width which can differ.
+     * @param[in] output        Output tensor. Data types supported: Same as @p input.
+     *                          Output tensor dimensions are the same with the inputs from the second dimension and above.
+     *                          The first dimension (width) is the sum of the input tensors' widths.
      *
      * @return a status
      */
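
For context, a minimal usage sketch of the CLConcatenateLayer interface introduced by this patch, concatenating two tensors along the channel/depth axis. This is illustrative only and not part of the change: the tensor names, shapes, F32 data type and the explicit output initialisation are assumptions, and in a real program the inputs would be filled (e.g. via map()/unmap()) before running.

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLFunctions.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    // Create a default OpenCL context and queue for the CL backend.
    CLScheduler::get().default_init();

    // Two hypothetical inputs of shape (W, H, C) = (16, 16, 3) and (16, 16, 5).
    CLTensor src0{}, src1{}, dst{};
    src0.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
    src1.allocator()->init(TensorInfo(TensorShape(16U, 16U, 5U), 1, DataType::F32));
    // Depth concatenation: output depth is the sum of the input depths (3 + 5 = 8).
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

    // Optionally check the configuration up front via the new validate() entry point.
    ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate({ src0.info(), src1.info() }, dst.info(), DataLayoutDimension::CHANNEL));

    // Axis CHANNEL selects the depth concatenation path (underlying axis 2 for the default NCHW layout).
    CLConcatenateLayer concat{};
    concat.configure({ &src0, &src1 }, &dst, DataLayoutDimension::CHANNEL);

    // Allocate the CL buffers, then run and wait for the queue to finish.
    src0.allocator()->allocate();
    src1.allocator()->allocate();
    dst.allocator()->allocate();
    concat.run();
    CLScheduler::get().sync();

    return 0;
}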