From 583137cc60580023abfd9d05abf933e7e117e29f Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Thu, 31 Aug 2017 18:12:42 +0100
Subject: COMPMID-417: Add support for floats in scale.

Change-Id: I7d714ba13861509080a89817f54e9d32da83e970
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86026
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
---
 arm_compute/core/CL/kernels/CLScaleKernel.h   |  8 ++--
 arm_compute/core/Helpers.h                    | 66 +++++++++++++++++++++++----
 arm_compute/core/Helpers.inl                  | 49 --------------------
 arm_compute/core/NEON/kernels/NEScaleKernel.h |  4 +-
 arm_compute/core/PixelValue.h                 | 38 ++++++---------
 arm_compute/core/Types.h                      |  4 ++
 arm_compute/runtime/CL/functions/CLScale.h    |  6 +--
 arm_compute/runtime/NEON/functions/NEScale.h  |  6 +--
 8 files changed, 87 insertions(+), 94 deletions(-)

(limited to 'arm_compute')
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
index e74a7cb82a..0a4bbf0b5a 100644
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -31,16 +31,14 @@ namespace arm_compute
 {
 class ICLTensor;
 
-/** Interface for the warp affine kernel.*/
+/** Interface for the scale kernel */
 class CLScaleKernel : public ICLSimple2DKernel
 {
 public:
     /** Initialise the kernel's inputs, output and interpolation policy
      *
-     * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8, S16.
-     * @param[out] output           Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+     * @param[in]  input            Source tensor. Data types supported: U8/S16/F16/F32
+     * @param[out] output           Destination tensor. Data types supported: Same as @p input
      *                              All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  policy           Interpolation type to use
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index b6461bc47a..6e4d987180 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -26,12 +26,14 @@
 
 #include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Steps.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
+
 #include <array>
 #include <cstddef>
 #include <cstdint>
@@ -82,9 +84,9 @@ struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>
 }
 
 /** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates.
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
  *
- * @param[in] pixel_ptr Pointer to the top-left pixel value. Format: Single channel U8
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
  * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
  * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
  * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
@@ -93,26 +95,57 @@ struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>
  *
  * @return The bilinear interpolated pixel value
  */
-inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy);
+template <typename T>
+inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+    // 2x2 neighbourhood: stride is added to a const T *, so it advances by elements of T to reach the row below.
+    const T *const next_row     = pixel_ptr + stride;
+    const T        top_left     = pixel_ptr[0];
+    const T        top_right    = pixel_ptr[1];
+    const T        bottom_left  = next_row[0];
+    const T        bottom_right = next_row[1];
+    // Bilinear weights for the four neighbours, built from dx, dy and their complements (1 - dx), (1 - dy).
+    const float inv_dx    = 1.0f - dx;
+    const float inv_dy    = 1.0f - dy;
+    const float weight_tl = inv_dx * inv_dy;
+    const float weight_tr = dx * inv_dy;
+    const float weight_bl = inv_dx * dy;
+    const float weight_br = dx * dy;
 
-/** Return the pixel at (x,y) using bilinear interpolation. The image must be single channel U8
+    return static_cast<T>(top_left * weight_tl + top_right * weight_tr + bottom_left * weight_bl + bottom_right * weight_br);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation.
  *
  * @warning Only works if the iterator was created with an IImage
  *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel input.
  * @param[in] stride          Stride in bytes of the image;
  * @param[in] x               X position of the wanted pixel
  * @param[in] y               Y position of the wanted pixel
  *
  * @return The pixel at (x, y) using bilinear interpolation.
  */
-inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y);
+template <typename T>
+inline T pixel_bilinear_c1(const T *first_pixel_ptr, size_t stride, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+    // Integer coordinates of the top-left neighbour of (x, y); the conversion from float is made explicit.
+    const int32_t xi = static_cast<int32_t>(std::floor(x));
+    const int32_t yi = static_cast<int32_t>(std::floor(y));
 
-/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel U8
+    const float dx = x - xi;
+    const float dy = y - yi;
+    // Signed row offset: yi * stride would otherwise be evaluated as size_t and wrap around for a negative yi.
+    return delta_bilinear_c1(first_pixel_ptr + xi + yi * static_cast<std::ptrdiff_t>(stride), stride, dx, dy);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation, clamping the coordinates when they are out of borders. The input must be single channel.
  *
  * @warning Only works if the iterator was created with an IImage
  *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel image.
  * @param[in] stride          Stride in bytes of the image
  * @param[in] width           Width of the image
  * @param[in] height          Height of the image
@@ -121,7 +154,22 @@ inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride
  *
  * @return The pixel at (x, y) using bilinear interpolation.
  */
-inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y);
+template <typename T>
+inline T pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+    // Clamp the sampling position to [-1, width] along X and [-1, height] along Y.
+    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
+    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
+
+    const float xi = std::floor(x);
+    const float yi = std::floor(y);
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    // Signed row offset: yi can be -1 after clamping, and int32_t * size_t would wrap around as unsigned.
+    return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * static_cast<std::ptrdiff_t>(stride), stride, dx, dy);
+}
 
 /** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8
  *
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index c2ca3b44b3..de6c85ec76 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -29,55 +29,6 @@
 
 namespace arm_compute
 {
-inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy)
-{
-    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
-    const float dx1 = 1.0f - dx;
-    const float dy1 = 1.0f - dy;
-
-    const float a00 = *pixel_ptr;
-    const float a01 = *(pixel_ptr + 1);
-    const float a10 = *(pixel_ptr + stride);
-    const float a11 = *(pixel_ptr + stride + 1);
-
-    const float w1 = dx1 * dy1;
-    const float w2 = dx * dy1;
-    const float w3 = dx1 * dy;
-    const float w4 = dx * dy;
-
-    return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
-}
-
-inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y)
-{
-    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
-    const int32_t xi = std::floor(x);
-    const int32_t yi = std::floor(y);
-
-    const float dx = x - xi;
-    const float dy = y - yi;
-
-    return delta_bilinear_c1u8(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
-}
-
-inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
-{
-    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
-    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
-    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
-    const float xi = std::floor(x);
-    const float yi = std::floor(y);
-
-    const float dx = x - xi;
-    const float dy = y - yi;
-
-    return delta_bilinear_c1u8(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
-}
-
 inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
 {
     ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index 3cac023d8a..660ecfdf72 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -52,11 +52,11 @@ public:
      *
      * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
      *
-     * @param[in]  input            Source tensor. Data types supported: U8/S16.
+     * @param[in]  input            Source tensor. Data types supported: U8/S16/F32.
      * @param[in]  dx               Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
      * @param[in]  dy               Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
      * @param[in]  offsets          Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[out] output           Destination tensor. Data types supported: U8/S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[out] output           Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  policy           Interpolation type to use
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
      */
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 1b1a5a3845..63405560ea 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -24,11 +24,9 @@
 #ifndef __ARM_COMPUTE_PIXELVALUE_H__
 #define __ARM_COMPUTE_PIXELVALUE_H__
 
-#include <cstdint>
+#include "arm_compute/core/Types.h"
 
-#if ARM_COMPUTE_ENABLE_FP16
-#include <arm_fp16.h> // needed for float16_t
-#endif                /* ARM_COMPUTE_ENABLE_FP16 */
+#include <cstdint>
 
 namespace arm_compute
 {
@@ -86,17 +84,15 @@ public:
     {
         value.s32 = v;
     }
-#if ARM_COMPUTE_ENABLE_FP16
     /** Initialize the union with a F16 pixel value
      *
      * @param[in] v F16 value.
      */
-    PixelValue(float16_t v)
+    PixelValue(half v)
         : PixelValue()
     {
         value.f16 = v;
     }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
     /** Initialize the union with a F32 pixel value
      *
      * @param[in] v F32 value.
@@ -111,19 +107,17 @@ public:
      */
     union
         {
-            uint8_t rgb[3];  /**< 3 channels: RGB888 */
-            uint8_t yuv[3];  /**< 3 channels: Any YUV format */
-            uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
-            float   f32;     /**< Single channel float 32 */
-#if ARM_COMPUTE_ENABLE_FP16
-            float16_t f16; /**< Single channel F16 */
-#endif                 /* ARM_COMPUTE_ENABLE_FP16 */
-            uint8_t  u8;   /**< Single channel U8 */
-            int8_t   s8;   /**< Single channel S8 */
-            uint16_t u16;  /**< Single channel U16 */
-            int16_t  s16;  /**< Single channel S16 */
-            uint32_t u32;  /**< Single channel U32 */
-            int32_t  s32;  /**< Single channel S32 */
+            uint8_t  rgb[3];  /**< 3 channels: RGB888 */
+            uint8_t  yuv[3];  /**< 3 channels: Any YUV format */
+            uint8_t  rgbx[4]; /**< 4 channels: RGBX8888 */
+            float    f32;     /**< Single channel float 32 */
+            half     f16;     /**< Single channel F16 */
+            uint8_t  u8;      /**< Single channel U8 */
+            int8_t   s8;      /**< Single channel S8 */
+            uint16_t u16;     /**< Single channel U16 */
+            int16_t  s16;     /**< Single channel S16 */
+            uint32_t u32;     /**< Single channel U32 */
+            int32_t  s32;     /**< Single channel S32 */
         } value;
     /** Interpret the pixel value as a U8
      *
@@ -173,16 +167,14 @@ public:
     {
         v = value.s32;
     }
-#if ARM_COMPUTE_ENABLE_FP16
     /** Interpret the pixel value as a F16
      *
      * @param[out] v Returned value
      */
-    void get(float16_t &v) const
+    void get(half &v) const
     {
         v = value.f16;
     }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
     /** Interpret the pixel value as a F32
      *
      * @param[out] v Returned value
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index b90798e5ff..7d9cd4e0cc 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/TensorShape.h"
+#include "support/Half.h"
 
 #include <cstddef>
 #include <cstdint>
@@ -34,6 +35,9 @@
 
 namespace arm_compute
 {
+/** 16-bit floating point type */
+using half = half_float::half;
+
 /** Image colour formats */
 enum class Format
 {
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index c2438ddf9b..db491c1a44 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -39,14 +39,14 @@ class CLScale : public ICLSimpleFunction
 public:
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
-     * @param[in,out] input                 Source tensor. Data types supported: U8, S16. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+     * @param[in,out] input                 Source tensor. Data types supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: Same as @p input
      *                                      All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]     policy                The interpolation type.
      * @param[in]     border_mode           Strategy to use for borders.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+    void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue());
 };
 }
 #endif /*__ARM_COMPUTE_CLSCALE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 00a368eb72..7297880a7a 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -47,13 +47,13 @@ public:
     NEScale();
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
-     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output                Destination tensor. Data type supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in, out] input                 Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]      policy                The interpolation type.
      * @param[in]      border_mode           Strategy to use for borders.
      * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue());
 
     // Inherited methods overridden:
     void run() override;
-- 
cgit v1.2.1