From ac4e873dad6aa6291fc36aff62047a896db04f6a Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Wed, 5 Jul 2017 17:02:25 +0100
Subject: COMPMID-417: Port DepthConcatenate to QS8/QS16 for NEON/CL.

Change-Id: I3dddae63043c7aa18d908a4fc8abacf3c64f98ca
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80081
Tested-by: Kaizen
Reviewed-by: Steven Niu
---
 arm_compute/core/CL/CLHelpers.h                         |  8 ++++++++
 arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h  |  4 ++--
 .../core/NEON/kernels/NEDepthConcatenateKernel.h        | 18 +++++++++++-------
 arm_compute/runtime/CL/functions/CLDepthConcatenate.h   |  9 +++++----
 .../runtime/NEON/functions/NEDepthConcatenate.h         |  9 +++++----
 5 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'arm_compute')

diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index 01980d9793..eeb3e7699d 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -53,6 +53,14 @@ static constexpr const unsigned int max_cl_vector_width = 16;
  */
 std::string get_cl_type_from_data_type(const DataType &dt);
 
+/** Translates fixed point tensor data type to the underlying OpenCL type.
+ *
+ * @param[in] dt @ref DataType to be translated to OpenCL type.
+ *
+ * @return The string specifying the underlying OpenCL type to be used.
+ */
+std::string get_underlying_cl_type_from_data_type(const DataType &dt);
+
 /** Translates a given gpu device target to string.
  *
  * @param[in] target Given gpu target.
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
index eda4c66883..e85e0ec232 100644
--- a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h
@@ -52,9 +52,9 @@ public:
     ~CLDepthConcatenateKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input         Input tensor. Data types supported: F32.
+     * @param[in]     input         Input tensor. Data types supported: QS8/QS16/F16/F32.
      * @param[in]     depth_offset  The offset on the Z axis.
-     * @param[in,out] output        Output tensor. Data types supported: F32.
+     * @param[in,out] output        Output tensor. Data types supported: Same as @p input.
      *
      * @note: The output tensor's low two dimensions can't be smaller than the input one's.
      * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
index 7384cd1f02..b22d37bfe6 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
@@ -51,9 +51,9 @@ public:
     ~NEDepthConcatenateKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input         Input tensor. Data types supported: F32.
+     * @param[in]     input         Input tensor. Data types supported: QS8/QS16/F16/F32.
      * @param[in]     depth_offset  The offset on the Z axis.
-     * @param[in,out] output        Output tensor. Data types supported: F32.
+     * @param[in,out] output        Output tensor. Data types supported: Same as @p input.
      *
      * @note: The output tensor's low two dimensions can't be smaller than the input one's.
      * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
@@ -66,11 +66,15 @@ public:
     BorderSize border_size() const override;
 
 private:
-    const ITensor *_input;
-    ITensor       *_output;
-    int            _top_bottom;
-    int            _left_right;
-    unsigned int   _depth_offset;
+    using DepthConcatFunction = void(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window);
+
+private:
+    DepthConcatFunction *_func;
+    const ITensor       *_input;
+    ITensor             *_output;
+    int                  _top_bottom;
+    int                  _left_right;
+    unsigned int         _depth_offset;
 };
 }
 #endif /* __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
index 3199936b82..77997f6bd1 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h
@@ -29,14 +29,15 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/IFunction.h"
 
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+
 #include <memory>
 #include <vector>
 
 namespace arm_compute
 {
 class ICLTensor;
-class CLDepthConcatenateKernel;
-class CLFillBorderKernel;
 
 /** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
  *
@@ -51,8 +52,8 @@ public:
     CLDepthConcatenate();
     /** Initialise the kernel's inputs vector and output.
      *
-     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: F32.
-     * @param[out]    output        Output tensor. Data types supported: F32.
+     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32.
+     * @param[out]    output        Output tensor. Data types supported: Same as @p input.
      */
     void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output);
 
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
index 02ff1227c7..cc65099575 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h
@@ -26,14 +26,15 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+
 #include <memory>
 #include <vector>
 
 namespace arm_compute
 {
 class ITensor;
-class NEDepthConcatenateKernel;
-class NEFillBorderKernel;
 
 /** Basic function to execute concatenate tensors along z axis. This function calls the following kernels:
  *
@@ -48,8 +49,8 @@ public:
     NEDepthConcatenate();
    /** Initialise the kernel's inputs vector and output.
     *
-    * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: F32.
-    * @param[out]    output        Output tensor. Data types supported: F32.
+    * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32.
+    * @param[out]    output        Output tensor. Data types supported: Same as @p inputs_vector.
     */
    void configure(std::vector<ITensor *> inputs_vector, ITensor *output);
 
--
cgit v1.2.1
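
Usage sketch (not part of the patch above): the headers in this change only declare the ported interfaces, so the snippet below is a minimal, hypothetical example of driving the newly supported QS8 fixed-point type through the NEON runtime function. It assumes the library's usual Tensor/TensorAllocator/TensorInfo flow; the tensor shapes and the fixed-point position of 4 are illustrative choices, not anything specified by this change.

// Minimal sketch, assuming the Tensor/TensorInfo API of this era of the
// library: concatenate two QS8 tensors along the Z axis with the NEON
// runtime function whose configure() signature is shown in the diff above.
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src0{}, src1{}, dst{};

    // Illustrative shapes: two 16x16 inputs with 3 and 5 channels feeding a
    // 16x16x8 output; fixed-point position 4 is an arbitrary QS8 choice.
    src0.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::QS8, 4));
    src1.allocator()->init(TensorInfo(TensorShape(16U, 16U, 5U), 1, DataType::QS8, 4));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::QS8, 4));

    // Configure first, then allocate the backing memory.
    NEDepthConcatenate concat{};
    concat.configure({ &src0, &src1 }, &dst);

    src0.allocator()->allocate();
    src1.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src0 and src1 with QS8 data here ...

    concat.run(); // dst now holds src0 and src1 stacked along the Z axis
    return 0;
}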