70 files changed, 627 insertions, 364 deletions
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
index e74a7cb82a..0a4bbf0b5a 100644
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -31,16 +31,14 @@ namespace arm_compute
 {
 class ICLTensor;
 
-/** Interface for the warp affine kernel.*/
+/** Interface for the scale kernel */
 class CLScaleKernel : public ICLSimple2DKernel
 {
 public:
     /** Initialise the kernel's inputs, output and interpolation policy
      *
-     * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
-     *
-     * @param[in]  input            Source tensor. Data types supported: U8, S16.
-     * @param[out] output           Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+     * @param[in]  input            Source tensor. Data types supported: U8/S16/F16/F32
+     * @param[out] output           Destination tensor. Data types supported: Same as @p input
      *                              All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  policy           Interpolation type to use
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index b6461bc47a..6e4d987180 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -26,12 +26,14 @@
 
 #include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Steps.h"
 #include "arm_compute/core/Strides.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
+
 #include <array>
 #include <cstddef>
 #include <cstdint>
@@ -82,9 +84,9 @@ struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>
 }
 
 /** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates.
+ * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
  *
- * @param[in] pixel_ptr Pointer to the top-left pixel value. Format: Single channel U8
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
  * @param[in] stride    Stride to access the bottom-left and bottom-right pixel values
  * @param[in] dx        Pixel's distance between the X real coordinate and the smallest X following integer
  * @param[in] dy        Pixel's distance between the Y real coordinate and the smallest Y following integer
@@ -93,26 +95,57 @@ struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>
  *
  * @return The bilinear interpolated pixel value
  */
-inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy);
+template <typename T>
+inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx;
+    const float dy1 = 1.0f - dy;
+
+    const T a00 = *pixel_ptr;
+    const T a01 = *(pixel_ptr + 1);
+    const T a10 = *(pixel_ptr + stride);
+    const T a11 = *(pixel_ptr + stride + 1);
+
+    const float w1 = dx1 * dy1;
+    const float w2 = dx * dy1;
+    const float w3 = dx1 * dy;
+    const float w4 = dx * dy;
 
-/** Return the pixel at (x,y) using bilinear interpolation. The image must be single channel U8
+    return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation.
  *
  * @warning Only works if the iterator was created with an IImage
  *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel input.
  * @param[in] stride          Stride in bytes of the image;
  * @param[in] x               X position of the wanted pixel
  * @param[in] y               Y position of the wanted pixel
  *
  * @return The pixel at (x, y) using bilinear interpolation.
  */
-inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y);
+template <typename T>
+inline T pixel_bilinear_c1(const T *first_pixel_ptr, size_t stride, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    const int32_t xi = std::floor(x);
+    const int32_t yi = std::floor(y);
 
-/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel U8
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    return delta_bilinear_c1(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
+}
+
+/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel input
  *
  * @warning Only works if the iterator was created with an IImage
  *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
+ * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel image.
  * @param[in] stride          Stride in bytes of the image
  * @param[in] width           Width of the image
  * @param[in] height          Height of the image
@@ -121,7 +154,22 @@ inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride
  *
  * @return The pixel at (x, y) using bilinear interpolation.
  */
-inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y);
+template <typename T>
+inline uint8_t pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
+{
+    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
+
+    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
+    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
+
+    const float xi = std::floor(x);
+    const float yi = std::floor(y);
+
+    const float dx = x - xi;
+    const float dy = y - yi;
+
+    return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
+}
 
 /** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8
  *
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index c2ca3b44b3..de6c85ec76 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -29,55 +29,6 @@
 
 namespace arm_compute
 {
-inline uint8_t delta_bilinear_c1u8(const uint8_t *pixel_ptr, size_t stride, float dx, float dy)
-{
-    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
-    const float dx1 = 1.0f - dx;
-    const float dy1 = 1.0f - dy;
-
-    const float a00 = *pixel_ptr;
-    const float a01 = *(pixel_ptr + 1);
-    const float a10 = *(pixel_ptr + stride);
-    const float a11 = *(pixel_ptr + stride + 1);
-
-    const float w1 = dx1 * dy1;
-    const float w2 = dx * dy1;
-    const float w3 = dx1 * dy;
-    const float w4 = dx * dy;
-
-    return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
-}
-
-inline uint8_t pixel_bilinear_c1u8(const uint8_t *first_pixel_ptr, size_t stride, float x, float y)
-{
-    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
-    const int32_t xi = std::floor(x);
-    const int32_t yi = std::floor(y);
-
-    const float dx = x - xi;
-    const float dy = y - yi;
-
-    return delta_bilinear_c1u8(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
-}
-
-inline uint8_t pixel_bilinear_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
-{
-    ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
-    x = std::max(-1.f, std::min(x, static_cast<float>(width)));
-    y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
-    const float xi = std::floor(x);
-    const float yi = std::floor(y);
-
-    const float dx = x - xi;
-    const float dy = y - yi;
-
-    return delta_bilinear_c1u8(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
-}
-
 inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
 {
     ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index 3cac023d8a..660ecfdf72 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -52,11 +52,11 @@ public:
      *
      * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
      *
-     * @param[in]  input            Source tensor. Data types supported: U8/S16.
+     * @param[in]  input            Source tensor. Data types supported: U8/S16/F32.
      * @param[in]  dx               Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
      * @param[in]  dy               Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
      * @param[in]  offsets          Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[out] output           Destination tensor. Data types supported: U8/S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[out] output           Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  policy           Interpolation type to use
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
      */
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 1b1a5a3845..63405560ea 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -24,11 +24,9 @@
 #ifndef __ARM_COMPUTE_PIXELVALUE_H__
 #define __ARM_COMPUTE_PIXELVALUE_H__
 
-#include <cstdint>
+#include "arm_compute/core/Types.h"
 
-#if ARM_COMPUTE_ENABLE_FP16
-#include <arm_fp16.h> // needed for float16_t
-#endif                /* ARM_COMPUTE_ENABLE_FP16 */
+#include <cstdint>
 
 namespace arm_compute
 {
@@ -86,17 +84,15 @@ public:
     {
         value.s32 = v;
     }
-#if ARM_COMPUTE_ENABLE_FP16
     /** Initialize the union with a F16 pixel value
      *
      * @param[in] v F16 value.
      */
-    PixelValue(float16_t v)
+    PixelValue(half v)
         : PixelValue()
     {
         value.f16 = v;
     }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
     /** Initialize the union with a F32 pixel value
      *
      * @param[in] v F32 value.
@@ -111,19 +107,17 @@ public:
      */
     union
         {
-            uint8_t rgb[3];  /**< 3 channels: RGB888 */
-            uint8_t yuv[3];  /**< 3 channels: Any YUV format */
-            uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
-            float   f32;     /**< Single channel float 32 */
-#if ARM_COMPUTE_ENABLE_FP16
-            float16_t f16; /**< Single channel F16 */
-#endif                 /* ARM_COMPUTE_ENABLE_FP16 */
-            uint8_t  u8;   /**< Single channel U8 */
-            int8_t   s8;   /**< Single channel S8 */
-            uint16_t u16;  /**< Single channel U16 */
-            int16_t  s16;  /**< Single channel S16 */
-            uint32_t u32;  /**< Single channel U32 */
-            int32_t  s32;  /**< Single channel S32 */
+            uint8_t  rgb[3];  /**< 3 channels: RGB888 */
+            uint8_t  yuv[3];  /**< 3 channels: Any YUV format */
+            uint8_t  rgbx[4]; /**< 4 channels: RGBX8888 */
+            float    f32;     /**< Single channel float 32 */
+            half     f16;     /**< Single channel F16 */
+            uint8_t  u8;      /**< Single channel U8 */
+            int8_t   s8;      /**< Single channel S8 */
+            uint16_t u16;     /**< Single channel U16 */
+            int16_t  s16;     /**< Single channel S16 */
+            uint32_t u32;     /**< Single channel U32 */
+            int32_t  s32;     /**< Single channel S32 */
         } value;
     /** Interpret the pixel value as a U8
      *
@@ -173,16 +167,14 @@ public:
     {
         v = value.s32;
     }
-#if ARM_COMPUTE_ENABLE_FP16
     /** Interpret the pixel value as a F16
      *
      * @param[out] v Returned value
      */
-    void get(float16_t &v) const
+    void get(half &v) const
     {
         v = value.f16;
     }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
     /** Interpret the pixel value as a F32
      *
      * @param[out] v Returned value
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index b90798e5ff..7d9cd4e0cc 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/TensorShape.h"
+#include "support/Half.h"
 
 #include <cstddef>
 #include <cstdint>
@@ -34,6 +35,9 @@
 
 namespace arm_compute
 {
+/** 16-bit floating point type */
+using half = half_float::half;
+
 /** Image colour formats */
 enum class Format
 {
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index c2438ddf9b..db491c1a44 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -39,14 +39,14 @@ class CLScale : public ICLSimpleFunction
 public:
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
-     * @param[in,out] input                 Source tensor. Data types supported: U8, S16. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]    output                Destination tensor. Data types supported: U8, S16 (Must be the same as the input tensor).
+     * @param[in,out] input                 Source tensor. Data types supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output                Destination tensor. Data types supported: Same as @p input
      *                                      All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]     policy                The interpolation type.
      * @param[in]     border_mode           Strategy to use for borders.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+    void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue());
 };
 }
 #endif /*__ARM_COMPUTE_CLSCALE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 00a368eb72..7297880a7a 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -47,13 +47,13 @@ public:
     NEScale();
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
-     * @param[in, out] input                 Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output                Destination tensor. Data type supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in, out] input                 Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]      policy                The interpolation type.
      * @param[in]      border_mode           Strategy to use for borders.
      * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue());
 
     // Inherited methods overridden:
     void run() override;
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index d2610539d1..2e066c7753 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 23ce89aba2..66afc3db60 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -46,9 +46,10 @@ BorderSize CLScaleKernel::border_size() const
 
 void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(output == input);
 
     _input  = input;
     _output = output;
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 3f1f678a7e..9505a2520f 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -27,16 +27,13 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
 #include <algorithm>
 #include <cstdint>
 
-#if ARM_COMPUTE_ENABLE_FP16
-#include <arm_fp16.h> // needed for float16_t
-#endif                /* ARM_COMPUTE_ENABLE_FP16 */
-
 using namespace arm_compute;
 
 namespace
@@ -163,18 +160,20 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                 case DataType::S32:
                     fill_constant_value_single_channel<int32_t>(window);
                     break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
                 case DataType::F16:
-                    static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
-                    fill_constant_value_single_channel<float16_t>(window);
+                    static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+                    fill_constant_value_single_channel<half>(window);
                     break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
                 case DataType::F32:
                     static_assert(sizeof(float) == 4, "Float must be 32 bit");
                     if(_border_size.left == 1 && _border_size.top == 1)
+                    {
                         fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+                    }
                     else
+                    {
                         fill_constant_value_single_channel<float>(window);
+                    }
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Not handled");
@@ -205,12 +204,10 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
                 case DataType::S32:
                     fill_replicate_single_channel<int32_t>(window);
                     break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
                 case DataType::F16:
-                    static_assert(sizeof(float16_t) == 2, "Float16_t must be 16 bit");
-                    fill_replicate_single_channel<float16_t>(window);
+                    static_assert(sizeof(half) == 2, "Float16_t must be 16 bit");
+                    fill_replicate_single_channel<half>(window);
                     break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
                 case DataType::F32:
                     static_assert(sizeof(float) == 4, "Float must be 32 bit");
                     fill_replicate_single_channel<float>(window);
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 2dbabca2e3..83004aedc1 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -192,24 +192,24 @@ void NERemapKernel::remap_bilinear(const Window &window)
         const uint8_t *in_ptr   = in.ptr();
 
         uint8x8_t tmp0 = vdup_n_u8(0);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
+        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
 
         uint8x8_t tmp1 = vdup_n_u8(0);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
+        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
 
         vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
     },
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 7ec4212227..6634d4b13c 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -50,8 +50,10 @@ BorderSize NEScaleKernel::border_size() const
 
 void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(output == input);
 
     if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
@@ -243,6 +245,50 @@ void NEScaleKernel::scale_nearest(const Window &window)
             in, offsets, out);
             break;
         }
+        case DataType::F32:
+        {
+            float32x4x4_t tmp =
+            {
+                {
+                    vdupq_n_f32(0),
+                    vdupq_n_f32(0),
+                    vdupq_n_f32(0),
+                    vdupq_n_f32(0)
+                }
+            };
+
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+                const int in_yi      = (id.y() + 0.5f) * hr;
+                const int offset_row = in_yi * input_stride;
+
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
+
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
+
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
+
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
+
+                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+            },
+            in, offsets, out);
+            break;
+        }
         default:
             ARM_COMPUTE_ERROR("Not supported");
             break;
@@ -251,7 +297,7 @@ void NEScaleKernel::scale_nearest(const Window &window)
 
 void NEScaleKernel::scale_bilinear(const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F32);
 
     // Compute the ratio between source height and destination height
     const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -278,41 +324,140 @@ void NEScaleKernel::scale_bilinear(const Window &window)
     Iterator dy(_dy, win_off);
 
     /* Input image stride */
-    const size_t in_stride = _input->info()->strides_in_bytes()[1];
+    const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
+    const size_t in_stride         = in_stide_in_bytes / _input->info()->element_size();
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    switch(_input->info()->data_type())
     {
-        const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-        const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-        const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-        const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
+        case DataType::U8:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
+                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
+                const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
+
+                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int offset_row = in_yi * in_stide_in_bytes;
+
+                uint8x8_t tmp0 = vdup_n_u8(0);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+
+                uint8x8_t tmp1 = vdup_n_u8(0);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+
+                vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
+            },
+            in, offsets, dx, dy, out);
+            break;
+        }
+        case DataType::S16:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
+                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-        const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
-        const int offset_row = in_yi * in_stride;
+                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int offset_row = in_yi * in_stide_in_bytes;
 
-        uint8x8_t tmp0 = vdup_n_u8(0);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
-        tmp0           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+                int16x8x2_t tmp =
+                {
+                    {
+                        vdupq_n_s16(0),
+                        vdupq_n_s16(0)
+                    }
+                };
+
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
+                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
+
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
+                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
 
-        uint8x8_t tmp1 = vdup_n_u8(0);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
-        tmp1           = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+                vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
+            },
+            in, offsets, dx, dy, out);
+            break;
+        }
+        case DataType::F32:
+        {
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
+                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
-    },
-    in, offsets, dx, dy, out);
+                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int offset_row = in_yi * in_stide_in_bytes;
+
+                float32x4x4_t tmp =
+                {
+                    {
+                        vdupq_n_f32(0),
+                        vdupq_n_f32(0),
+                        vdupq_n_f32(0),
+                        vdupq_n_f32(0)
+                    }
+                };
+
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
+
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
+
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
+                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
+
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
+                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
+
+                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+            },
+            in, offsets, dx, dy, out);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
 }
 
 void NEScaleKernel::scale_area(const Window &window)
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index b13e99e800..62f4e5d057 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -177,7 +177,7 @@ void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -256,7 +256,7 @@ void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -336,7 +336,7 @@ void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, x0, y0);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -445,7 +445,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -546,7 +546,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
@@ -678,7 +678,7 @@ void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window
                     *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
                     break;
                 case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+                    *out.ptr() = pixel_bilinear_c1(in.ptr(), stride, xn, yn);
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Interpolation not supported");
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 05522b4af5..49b0275019 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -31,13 +31,8 @@
 
 using namespace arm_compute;
 
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON(output == input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
     auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
     k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
     _kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 0b17e80ee7..bbd3fac63f 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -95,7 +95,7 @@ NEScale::NEScale() // NOLINT
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == input);
     ARM_COMPUTE_ERROR_ON(nullptr == output);
diff --git a/tests/validation/half.h b/support/Half.h
index 0ca620cf57..0bc5e6268e 100644
--- a/tests/validation/half.h
+++ b/support/Half.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_TEST_HALF_H__
-#define __ARM_COMPUTE_TEST_HALF_H__
+#ifndef __ARM_COMPUTE_HALF_H__
+#define __ARM_COMPUTE_HALF_H__
 
 #ifdef __ANDROID__
 // Android toolchain is broken and doesn't support all CPP11 math functions.
@@ -35,4 +35,4 @@
 
 #include "half/half.hpp"
 
-#endif /* __ARM_COMPUTE_TEST_HALF_H__ */
+#endif /* __ARM_COMPUTE_HALF_H__ */
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index 8370702554..bd1d8b4758 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -35,7 +35,6 @@
 #include "tests/RawTensor.h"
 #include "tests/TensorCache.h"
 #include "tests/Utils.h"
-#include "tests/validation/half.h"
 
 #include <algorithm>
 #include <cstddef>
diff --git a/tests/Utils.h b/tests/Utils.h
index f325bb3f37..3861577d31 100644
--- a/tests/Utils.h
+++ b/tests/Utils.h
@@ -31,7 +31,6 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
-#include "tests/validation/half.h"
 
 #include <cmath>
 #include <cstddef>
@@ -99,7 +98,7 @@ template <> struct promote<int16_t> { using type = int32_t; };
 template <> struct promote<uint32_t> { using type = uint64_t; };
 template <> struct promote<int32_t> { using type = int64_t; };
 template <> struct promote<float> { using type = float; };
-template <> struct promote<half_float::half> { using type = half_float::half; };
+template <> struct promote<half> { using type = half; };
 
 
 template <typename T>
@@ -253,7 +252,7 @@ void store_value_with_data_type(void *ptr, T value, DataType data_type)
             *reinterpret_cast<int64_t *>(ptr) = value;
             break;
         case DataType::F16:
-            *reinterpret_cast<half_float::half *>(ptr) = value;
+            *reinterpret_cast<half *>(ptr) = value;
             break;
         case DataType::F32:
             *reinterpret_cast<float *>(ptr) = value;
diff --git a/tests/validation/CL/ActivationLayer.cpp b/tests/validation/CL/ActivationLayer.cpp
index 097fb638f9..83bd2d0a3a 100644
--- a/tests/validation/CL/ActivationLayer.cpp
+++ b/tests/validation/CL/ActivationLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ActivationLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -162,16 +161,16 @@ using CLActivationLayerFixture = ActivationValidationFixture<CLTensor, CLAccesso
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLActivationLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ActivationDataset),
-                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                                DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLActivationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ActivationDataset),
+                                                                                                            framework::dataset::make("DataType",
+                                                                                                                    DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance(_function, _data_type));
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLActivationLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ActivationDataset),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLActivationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ActivationDataset),
+                                                                                                          framework::dataset::make("DataType",
+                                                                                                                  DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance(_function, _data_type));
diff --git a/tests/validation/CL/ArithmeticAddition.cpp b/tests/validation/CL/ArithmeticAddition.cpp
index 2bc2a9b2d0..5089ecabcf 100644
--- a/tests/validation/CL/ArithmeticAddition.cpp
+++ b/tests/validation/CL/ArithmeticAddition.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ArithmeticAdditionFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -180,8 +179,8 @@ TEST_SUITE_END()
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<half_float::half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
+                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/ArithmeticSubtraction.cpp b/tests/validation/CL/ArithmeticSubtraction.cpp
index e2bc4b2d79..817a31fbd9 100644
--- a/tests/validation/CL/ArithmeticSubtraction.cpp
+++ b/tests/validation/CL/ArithmeticSubtraction.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ArithmeticSubtractionFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -182,8 +181,8 @@ TEST_SUITE_END()
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<half_float::half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
-                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
+                                                                                                            framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 860b92bb3c..b2fd22eaee 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ConvolutionLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -44,10 +43,10 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<float>            tolerance_f32(0.001f);                /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-constexpr AbsoluteTolerance<float>  tolerance_q(1.0f);                    /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
-constexpr float                     tolerance_num = 0.07f;                /**< Tolerance number */
+RelativeTolerance<float>           tolerance_f32(0.001f);    /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */
+RelativeTolerance<half>            tolerance_f16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
+constexpr AbsoluteTolerance<float> tolerance_q(1.0f);        /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+constexpr float                    tolerance_num = 0.07f;    /**< Tolerance number */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
@@ -103,18 +102,18 @@ using CLConvolutionLayerFixture = ConvolutionValidationFixture<CLTensor, CLAcces
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLConvolutionLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true, false })),
-                       framework::dataset::make("DataType",
-                                                DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                     framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                             framework::dataset::make("DataType",
+                                                                                                                     DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
-                                                                                                                       framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                                       framework::dataset::make("DataType",
-                                                                                                                               DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
+                                                                                                                   framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                           framework::dataset::make("DataType",
+                                                                                                                   DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
diff --git a/tests/validation/CL/DepthConcatenateLayer.cpp b/tests/validation/CL/DepthConcatenateLayer.cpp
index 8c738747e3..19a8b369ce 100644
--- a/tests/validation/CL/DepthConcatenateLayer.cpp
+++ b/tests/validation/CL/DepthConcatenateLayer.cpp
@@ -32,7 +32,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/DepthConcatenateLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -50,14 +49,14 @@ using CLDepthConcatenateLayerFixture = DepthConcatenateValidationFixture<CLTenso
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConcatenateLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                       DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLDepthConcatenateLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConcatenateLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
-                       DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLDepthConcatenateLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::Large2DShapes(), framework::dataset::make("DataType",
+                                                                                                                DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/DepthConvert.cpp b/tests/validation/CL/DepthConvert.cpp
index 6abb482f49..57669f0a52 100644
--- a/tests/validation/CL/DepthConvert.cpp
+++ b/tests/validation/CL/DepthConvert.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/DepthConvertFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp
index 553286a6a5..25e881f4ce 100644
--- a/tests/validation/CL/DirectConvolutionLayer.cpp
+++ b/tests/validation/CL/DirectConvolutionLayer.cpp
@@ -33,7 +33,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/DirectConvolutionLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -44,9 +43,9 @@ namespace validation
 namespace
 {
 // COMPMID-517 Invesitgate the mismatch to see whether it is a real bug
-RelativeTolerance<half_float::half> tolerance_fp16(half_float::half(0.2)); /**< Tolerance for floating point tests */
-RelativeTolerance<float>            tolerance_fp32(0.02f);                 /**< Tolerance for floating point tests */
-constexpr float                     tolerance_num = 0.07f;                 /**< Tolerance number */
+RelativeTolerance<half>  tolerance_fp16(half(0.2)); /**< Tolerance for floating point tests */
+RelativeTolerance<float> tolerance_fp32(0.02f);     /**< Tolerance for floating point tests */
+constexpr float          tolerance_num = 0.07f;     /**< Tolerance number */
 
 constexpr AbsoluteTolerance<int8_t>  tolerance_qs8(0);  /**< Tolerance for fixed point tests */
 constexpr AbsoluteTolerance<int16_t> tolerance_qs16(0); /**< Tolerance for fixed point tests */
@@ -85,7 +84,7 @@ using CLDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<CLTen
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixture<half_float::half>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(data, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num);
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index 356e9677fc..22f27e56dd 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -33,7 +33,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/FullyConnectedLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -44,9 +43,9 @@ namespace validation
 namespace
 {
 /** Tolerance for float operations */
-RelativeTolerance<float>            tolerance_f32(0.001f);
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2));
-constexpr float                     tolerance_num = 0.07f; /**< Tolerance number */
+RelativeTolerance<float> tolerance_f32(0.001f);
+RelativeTolerance<half>  tolerance_f16(half(0.2));
+constexpr float          tolerance_num = 0.07f; /**< Tolerance number */
 
 /** Tolerance for fixed point operations */
 constexpr AbsoluteTolerance<float> tolerance_fixed_point(1.f);
@@ -109,16 +108,16 @@ using CLFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture<CLTens
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallFullyConnectedLayerDataset(),
-                       FullyConnectedParameters),
-                       framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+                                                                                                                        FullyConnectedParameters),
+                                                                                                                framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeFullyConnectedLayerDataset(),
-                       FullyConnectedParameters),
-                       framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeFullyConnectedLayerDataset(),
+                                                                                                                      FullyConnectedParameters),
+                                                                                                              framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
diff --git a/tests/validation/CL/GEMM.cpp b/tests/validation/CL/GEMM.cpp
index e9414bf247..854551917e 100644
--- a/tests/validation/CL/GEMM.cpp
+++ b/tests/validation/CL/GEMM.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/GEMMFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -44,9 +43,9 @@ namespace validation
 {
 namespace
 {
-RelativeTolerance<float>            tolerance_f32(0.001f);                /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
-constexpr AbsoluteTolerance<float>  tolerance_q(1.0f);                    /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
+RelativeTolerance<float>           tolerance_f32(0.001f);    /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+RelativeTolerance<half>            tolerance_f16(half(0.2)); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+constexpr AbsoluteTolerance<float> tolerance_q(1.0f);        /**< Tolerance value for comparing reference's output against implementation's output for fixed point data types */
 
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
@@ -90,13 +89,13 @@ using CLGEMMFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM, T>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(), framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(), framework::dataset::make("DataType",
-                                                                                                           DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(), framework::dataset::make("DataType",
+                                                                                               DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/CL/L2Normalize.cpp b/tests/validation/CL/L2Normalize.cpp
index bd9bf17322..4b0820c211 100644
--- a/tests/validation/CL/L2Normalize.cpp
+++ b/tests/validation/CL/L2Normalize.cpp
@@ -33,7 +33,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/L2NormalizeFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/NormalizationLayer.cpp b/tests/validation/CL/NormalizationLayer.cpp
index 1a029cf924..4fca6bf297 100644
--- a/tests/validation/CL/NormalizationLayer.cpp
+++ b/tests/validation/CL/NormalizationLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/NormalizationLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -45,8 +44,8 @@ namespace validation
 namespace
 {
 /** Tolerance for float operations */
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2));
-RelativeTolerance<float>            tolerance_f32(0.05f);
+RelativeTolerance<half>  tolerance_f16(half(0.2));
+RelativeTolerance<float> tolerance_f32(0.05f);
 
 /** Tolerance for fixed point operations */
 constexpr AbsoluteTolerance<int8_t>  tolerance_qs8(2);
@@ -68,12 +67,12 @@ using CLNormalizationLayerFixture = NormalizationValidationFixture<CLTensor, CLA
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLNormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index 24380cb1f0..4dca93a9ac 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/PoolingLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -81,14 +80,14 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::Datase
 TEST_SUITE_END()
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<half_float::half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                               framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                   framework::dataset::make("DataType", DataType::F16))))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                                   framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                       framework::dataset::make("DataType", DataType::F16))))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp
index 37a21b4ff8..684ed4694f 100644
--- a/tests/validation/CL/ReductionOperation.cpp
+++ b/tests/validation/CL/ReductionOperation.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ReductionOperationFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CL/ReshapeLayer.cpp b/tests/validation/CL/ReshapeLayer.cpp
index 95388c92b8..57027a9801 100644
--- a/tests/validation/CL/ReshapeLayer.cpp
+++ b/tests/validation/CL/ReshapeLayer.cpp
@@ -57,8 +57,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLReshapeLayerFixture<float>, framework::Datase
 TEST_SUITE_END()
 
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLReshapeLayerFixture<half_float::half>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType",
-                                                                                                               DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLReshapeLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallReshapeLayerDataset(), framework::dataset::make("DataType",
+                                                                                                   DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp
index d5866fad97..014bb0070c 100644
--- a/tests/validation/CL/Scale.cpp
+++ b/tests/validation/CL/Scale.cpp
@@ -45,14 +45,25 @@ namespace validation
 {
 namespace
 {
+/** CNN data types */
+const auto ScaleDataTypes = framework::dataset::make("DataType",
+{
+    DataType::U8,
+    DataType::S16,
+    DataType::F16,
+    DataType::F32,
+});
+
 /** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance(1);
+RelativeTolerance<float>             tolerance_f32(0.01);
+RelativeTolerance<half>              tolerance_f16(half(0.1));
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(Scale)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", DataType::U8)),
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), ScaleDataTypes),
                                                                            framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                    datasets::BorderModes()),
                shape, data_type, policy, border_mode)
@@ -96,11 +107,63 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combi
 template <typename T>
 using CLScaleFixture = ScaleValidationFixture<CLTensor, CLAccessor, CLScale, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                             framework::dataset::make("InterpolationPolicy",
-{ InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-datasets::BorderModes()))
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
+                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                             datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
+                                                                                                         framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                 datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32);
+}
+TEST_SUITE_END()
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
+                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                            datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                DataType::F16)),
+                                                                                                        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE(Integer)
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)),
+                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                               datasets::BorderModes()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -109,11 +172,9 @@ datasets::BorderModes()))
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                           framework::dataset::make("InterpolationPolicy",
-{ InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-datasets::BorderModes()))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::U8)),
+                                                                                                           framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                   datasets::BorderModes()))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -123,6 +184,33 @@ datasets::BorderModes()))
     validate(CLAccessor(_target), _reference, valid_region, tolerance);
 }
 TEST_SUITE_END()
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::S16)),
+                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                               datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S16)),
+                                                                                                           framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                   datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 8c143ecd96..c469b8acd8 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -33,7 +33,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/SoftmaxLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -44,8 +43,8 @@ namespace validation
 namespace
 {
 /** Tolerance for float operations */
-RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.2));
-RelativeTolerance<float>            tolerance_f32(0.001f);
+RelativeTolerance<half>  tolerance_f16(half(0.2));
+RelativeTolerance<float> tolerance_f32(0.001f);
 
 /** Tolerance for fixed point operations */
 constexpr AbsoluteTolerance<int16_t> tolerance_fixed_point(2);
@@ -95,12 +94,12 @@ using CLSoftmaxLayerFixture = SoftmaxValidationFixture<CLTensor, CLAccessor, CLS
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/CPP/ActivationLayer.cpp b/tests/validation/CPP/ActivationLayer.cpp
index 8fcacca1e2..2243e6ff59 100644
--- a/tests/validation/CPP/ActivationLayer.cpp
+++ b/tests/validation/CPP/ActivationLayer.cpp
@@ -23,9 +23,9 @@
  */
 #include "ActivationLayer.h"
 
+#include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -155,7 +155,7 @@ SimpleTensor<T> activation_layer(const SimpleTensor<T> &src, ActivationLayerInfo
 }
 
 template SimpleTensor<float> activation_layer(const SimpleTensor<float> &src, ActivationLayerInfo info);
-template SimpleTensor<half_float::half> activation_layer(const SimpleTensor<half_float::half> &src, ActivationLayerInfo info);
+template SimpleTensor<half> activation_layer(const SimpleTensor<half> &src, ActivationLayerInfo info);
 template SimpleTensor<qint8_t> activation_layer(const SimpleTensor<qint8_t> &src, ActivationLayerInfo info);
 template SimpleTensor<qint16_t> activation_layer(const SimpleTensor<qint16_t> &src, ActivationLayerInfo info);
 } // namespace reference
diff --git a/tests/validation/CPP/ArithmeticAddition.cpp b/tests/validation/CPP/ArithmeticAddition.cpp
index 41052c469b..82dd1437cd 100644
--- a/tests/validation/CPP/ArithmeticAddition.cpp
+++ b/tests/validation/CPP/ArithmeticAddition.cpp
@@ -23,9 +23,9 @@
  */
 #include "ArithmeticAddition.h"
 
+#include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -54,8 +54,8 @@ SimpleTensor<T> arithmetic_addition(const SimpleTensor<T> &src1, const SimpleTen
 template SimpleTensor<uint8_t> arithmetic_addition(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<int16_t> arithmetic_addition(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<int8_t> arithmetic_addition(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<half_float::half> arithmetic_addition(const SimpleTensor<half_float::half> &src1, const SimpleTensor<half_float::half> &src2, DataType dst_data_type,
-                                                            ConvertPolicy convert_policy);
+template SimpleTensor<half> arithmetic_addition(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType dst_data_type,
+                                                ConvertPolicy convert_policy);
 template SimpleTensor<float> arithmetic_addition(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/CPP/ArithmeticSubtraction.cpp b/tests/validation/CPP/ArithmeticSubtraction.cpp
index fa7fec9d1b..80bdb15a49 100644
--- a/tests/validation/CPP/ArithmeticSubtraction.cpp
+++ b/tests/validation/CPP/ArithmeticSubtraction.cpp
@@ -25,7 +25,6 @@
 
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -54,8 +53,7 @@ SimpleTensor<T> arithmetic_subtraction(const SimpleTensor<T> &src1, const Simple
 template SimpleTensor<uint8_t> arithmetic_subtraction(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<int16_t> arithmetic_subtraction(const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<int8_t> arithmetic_subtraction(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
-template SimpleTensor<half_float::half> arithmetic_subtraction(const SimpleTensor<half_float::half> &src1, const SimpleTensor<half_float::half> &src2, DataType dst_data_type,
-                                                               ConvertPolicy convert_policy);
+template SimpleTensor<half> arithmetic_subtraction(const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<float> arithmetic_subtraction(const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/CPP/ConvolutionLayer.cpp b/tests/validation/CPP/ConvolutionLayer.cpp
index 1824ada791..656cd2ee26 100644
--- a/tests/validation/CPP/ConvolutionLayer.cpp
+++ b/tests/validation/CPP/ConvolutionLayer.cpp
@@ -25,7 +25,6 @@
 
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -193,8 +192,8 @@ SimpleTensor<T> convolution_layer(const SimpleTensor<T> &src, const SimpleTensor
 
 template SimpleTensor<float> convolution_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &output_shape,
                                                const PadStrideInfo &info);
-template SimpleTensor<half_float::half> convolution_layer(const SimpleTensor<half_float::half> &src, const SimpleTensor<half_float::half> &weights, const SimpleTensor<half_float::half> &bias,
-                                                          const TensorShape &output_shape, const PadStrideInfo &info);
+template SimpleTensor<half> convolution_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &output_shape,
+                                              const PadStrideInfo &info);
 template SimpleTensor<qint8_t> convolution_layer(const SimpleTensor<qint8_t> &src, const SimpleTensor<qint8_t> &weights, const SimpleTensor<qint8_t> &bias, const TensorShape &output_shape,
                                                  const PadStrideInfo &info);
 template SimpleTensor<qint16_t> convolution_layer(const SimpleTensor<qint16_t> &src, const SimpleTensor<qint16_t> &weights, const SimpleTensor<qint16_t> &bias, const TensorShape &output_shape,
diff --git a/tests/validation/CPP/DepthConcatenateLayer.cpp b/tests/validation/CPP/DepthConcatenateLayer.cpp
index 139d26f2b6..9a7248493d 100644
--- a/tests/validation/CPP/DepthConcatenateLayer.cpp
+++ b/tests/validation/CPP/DepthConcatenateLayer.cpp
@@ -25,7 +25,6 @@
 
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -95,7 +94,7 @@ SimpleTensor<T> depthconcatenate_layer(const std::vector<SimpleTensor<T>> &srcs)
 }
 
 template SimpleTensor<float> depthconcatenate_layer(const std::vector<SimpleTensor<float>> &srcs);
-template SimpleTensor<half_float::half> depthconcatenate_layer(const std::vector<SimpleTensor<half_float::half>> &srcs);
+template SimpleTensor<half> depthconcatenate_layer(const std::vector<SimpleTensor<half>> &srcs);
 template SimpleTensor<qint8_t> depthconcatenate_layer(const std::vector<SimpleTensor<qint8_t>> &srcs);
 template SimpleTensor<qint16_t> depthconcatenate_layer(const std::vector<SimpleTensor<qint16_t>> &srcs);
 } // namespace reference
diff --git a/tests/validation/CPP/DepthConvert.cpp b/tests/validation/CPP/DepthConvert.cpp
index bb34f67086..110174a73f 100644
--- a/tests/validation/CPP/DepthConvert.cpp
+++ b/tests/validation/CPP/DepthConvert.cpp
@@ -25,7 +25,6 @@
 
 #include "tests/validation/FixedPoint.h"
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 #include "tests/Types.h"
 
diff --git a/tests/validation/CPP/DepthwiseConvolution.cpp b/tests/validation/CPP/DepthwiseConvolution.cpp
index ebca333715..ce30bed640 100644
--- a/tests/validation/CPP/DepthwiseConvolution.cpp
+++ b/tests/validation/CPP/DepthwiseConvolution.cpp
@@ -27,7 +27,6 @@
 #include "Utils.h"
 
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CPP/DepthwiseSeparableConvolutionLayer.cpp b/tests/validation/CPP/DepthwiseSeparableConvolutionLayer.cpp
index 7020a854cf..3942ecf02a 100644
--- a/tests/validation/CPP/DepthwiseSeparableConvolutionLayer.cpp
+++ b/tests/validation/CPP/DepthwiseSeparableConvolutionLayer.cpp
@@ -29,7 +29,6 @@
 #include "Utils.h"
 
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/CPP/FullyConnectedLayer.cpp b/tests/validation/CPP/FullyConnectedLayer.cpp
index a146535bd9..2b32c4b161 100644
--- a/tests/validation/CPP/FullyConnectedLayer.cpp
+++ b/tests/validation/CPP/FullyConnectedLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "FullyConnectedLayer.h"
 
+#include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
-#include "tests/validation/half.h"
 
 #include <numeric>
 
@@ -123,8 +123,7 @@ SimpleTensor<T> fully_connected_layer(const SimpleTensor<T> &src, const SimpleTe
 }
 
 template SimpleTensor<float> fully_connected_layer(const SimpleTensor<float> &src, const SimpleTensor<float> &weights, const SimpleTensor<float> &bias, const TensorShape &dst_shape);
-template SimpleTensor<half_float::half> fully_connected_layer(const SimpleTensor<half_float::half> &src, const SimpleTensor<half_float::half> &weights, const SimpleTensor<half_float::half> &bias,
-                                                              const TensorShape &dst_shape);
+template SimpleTensor<half> fully_connected_layer(const SimpleTensor<half> &src, const SimpleTensor<half> &weights, const SimpleTensor<half> &bias, const TensorShape &dst_shape);
 template SimpleTensor<qint8_t> fully_connected_layer(const SimpleTensor<qint8_t> &src, const SimpleTensor<qint8_t> &weights, const SimpleTensor<qint8_t> &bias, const TensorShape &dst_shape);
 template SimpleTensor<qint16_t> fully_connected_layer(const SimpleTensor<qint16_t> &src, const SimpleTensor<qint16_t> &weights, const SimpleTensor<qint16_t> &bias, const TensorShape &dst_shape);
 } // namespace reference
diff --git a/tests/validation/CPP/GEMM.cpp b/tests/validation/CPP/GEMM.cpp
index 9b66597eb8..77d025ec8e 100644
--- a/tests/validation/CPP/GEMM.cpp
+++ b/tests/validation/CPP/GEMM.cpp
@@ -23,8 +23,8 @@
  */
 #include "GEMM.h"
 
+#include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -113,7 +113,7 @@ SimpleTensor<T> gemm(const SimpleTensor<T> &a, const SimpleTensor<T> &b, const S
 }
 
 template SimpleTensor<float> gemm(const SimpleTensor<float> &a, const SimpleTensor<float> &b, const SimpleTensor<float> &c, float alpha, float beta);
-template SimpleTensor<half_float::half> gemm(const SimpleTensor<half_float::half> &a, const SimpleTensor<half_float::half> &b, const SimpleTensor<half_float::half> &c, float alpha, float beta);
+template SimpleTensor<half> gemm(const SimpleTensor<half> &a, const SimpleTensor<half> &b, const SimpleTensor<half> &c, float alpha, float beta);
 template SimpleTensor<qint8_t> gemm(const SimpleTensor<qint8_t> &a, const SimpleTensor<qint8_t> &b, const SimpleTensor<qint8_t> &c, float alpha, float beta);
 template SimpleTensor<qint16_t> gemm(const SimpleTensor<qint16_t> &a, const SimpleTensor<qint16_t> &b, const SimpleTensor<qint16_t> &c, float alpha, float beta);
 } // namespace reference
diff --git a/tests/validation/CPP/NormalizationLayer.cpp b/tests/validation/CPP/NormalizationLayer.cpp
index 3c6f5e1a54..226af96fe3 100644
--- a/tests/validation/CPP/NormalizationLayer.cpp
+++ b/tests/validation/CPP/NormalizationLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "NormalizationLayer.h"
 
+#include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -266,7 +266,7 @@ SimpleTensor<T> normalization_layer(const SimpleTensor<T> &src, NormalizationLay
 }
 
 template SimpleTensor<float> normalization_layer(const SimpleTensor<float> &src, NormalizationLayerInfo info);
-template SimpleTensor<half_float::half> normalization_layer(const SimpleTensor<half_float::half> &src, NormalizationLayerInfo info);
+template SimpleTensor<half> normalization_layer(const SimpleTensor<half> &src, NormalizationLayerInfo info);
 template SimpleTensor<qint8_t> normalization_layer(const SimpleTensor<qint8_t> &src, NormalizationLayerInfo info);
 template SimpleTensor<qint16_t> normalization_layer(const SimpleTensor<qint16_t> &src, NormalizationLayerInfo info);
 } // namespace reference
diff --git a/tests/validation/CPP/PoolingLayer.cpp b/tests/validation/CPP/PoolingLayer.cpp
index c4425ca9a1..f7273f073f 100644
--- a/tests/validation/CPP/PoolingLayer.cpp
+++ b/tests/validation/CPP/PoolingLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "PoolingLayer.h"
 
+#include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -234,7 +234,7 @@ SimpleTensor<T> pooling_layer(const SimpleTensor<T> &src, PoolingLayerInfo info)
 }
 
 template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, PoolingLayerInfo info);
-template SimpleTensor<half_float::half> pooling_layer(const SimpleTensor<half_float::half> &src, PoolingLayerInfo info);
+template SimpleTensor<half> pooling_layer(const SimpleTensor<half> &src, PoolingLayerInfo info);
 template SimpleTensor<qint8_t> pooling_layer(const SimpleTensor<qint8_t> &src, PoolingLayerInfo info);
 template SimpleTensor<qint16_t> pooling_layer(const SimpleTensor<qint16_t> &src, PoolingLayerInfo info);
 } // namespace reference
diff --git a/tests/validation/CPP/ReshapeLayer.cpp b/tests/validation/CPP/ReshapeLayer.cpp
index cc7f15e4b1..42f06e4f5a 100644
--- a/tests/validation/CPP/ReshapeLayer.cpp
+++ b/tests/validation/CPP/ReshapeLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "ReshapeLayer.h"
 
-#include "tests/validation/half.h"
+#include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
@@ -49,7 +49,7 @@ template SimpleTensor<uint16_t> reshape_layer(const SimpleTensor<uint16_t> &src,
 template SimpleTensor<int16_t> reshape_layer(const SimpleTensor<int16_t> &src, const TensorShape &output_shape);
 template SimpleTensor<uint32_t> reshape_layer(const SimpleTensor<uint32_t> &src, const TensorShape &output_shape);
 template SimpleTensor<int32_t> reshape_layer(const SimpleTensor<int32_t> &src, const TensorShape &output_shape);
-template SimpleTensor<half_float::half> reshape_layer(const SimpleTensor<half_float::half> &src, const TensorShape &output_shape);
+template SimpleTensor<half> reshape_layer(const SimpleTensor<half> &src, const TensorShape &output_shape);
 template SimpleTensor<float> reshape_layer(const SimpleTensor<float> &src, const TensorShape &output_shape);
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/CPP/Scale.cpp b/tests/validation/CPP/Scale.cpp
index a1119f33b9..ba34553a99 100644
--- a/tests/validation/CPP/Scale.cpp
+++ b/tests/validation/CPP/Scale.cpp
@@ -36,7 +36,7 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> scale(const SimpleTensor<T> &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+SimpleTensor<T> scale(const SimpleTensor<T> &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value)
 {
     TensorShape shape_scaled(in.shape());
     shape_scaled.set(0, in.shape()[0] * scale_x);
@@ -160,6 +160,9 @@ SimpleTensor<T> scale(const SimpleTensor<T> &in, float scale_x, float scale_y, I
 }
 
 template SimpleTensor<uint8_t> scale(const SimpleTensor<uint8_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value);
+template SimpleTensor<int16_t> scale(const SimpleTensor<int16_t> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, int16_t constant_border_value);
+template SimpleTensor<half> scale(const SimpleTensor<half> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, half constant_border_value);
+template SimpleTensor<float> scale(const SimpleTensor<float> &src, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, float constant_border_value);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/CPP/Scale.h b/tests/validation/CPP/Scale.h
index b882915946..53183ae742 100644
--- a/tests/validation/CPP/Scale.h
+++ b/tests/validation/CPP/Scale.h
@@ -35,7 +35,7 @@ namespace validation
 namespace reference
 {
 template <typename T>
-SimpleTensor<T> scale(const SimpleTensor<T> &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
+SimpleTensor<T> scale(const SimpleTensor<T> &in, float scale_x, float scale_y, InterpolationPolicy policy, BorderMode border_mode, T constant_border_value = 0);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/CPP/SoftmaxLayer.cpp b/tests/validation/CPP/SoftmaxLayer.cpp
index 4fe87d07dc..eb7655078c 100644
--- a/tests/validation/CPP/SoftmaxLayer.cpp
+++ b/tests/validation/CPP/SoftmaxLayer.cpp
@@ -23,8 +23,8 @@
  */
 #include "SoftmaxLayer.h"
 
+#include "arm_compute/core/Types.h"
 #include "tests/validation/FixedPoint.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -113,7 +113,7 @@ SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src)
 }
 
 template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src);
-template SimpleTensor<half_float::half> softmax_layer(const SimpleTensor<half_float::half> &src);
+template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src);
 template SimpleTensor<qint8_t> softmax_layer(const SimpleTensor<qint8_t> &src);
 template SimpleTensor<qint16_t> softmax_layer(const SimpleTensor<qint16_t> &src);
 } // namespace reference
diff --git a/tests/validation/CPP/Utils.cpp b/tests/validation/CPP/Utils.cpp
index 15e9fc3138..2f54879818 100644
--- a/tests/validation/CPP/Utils.cpp
+++ b/tests/validation/CPP/Utils.cpp
@@ -24,7 +24,6 @@
 #include "Utils.h"
 
 #include "tests/validation/Helpers.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -51,17 +50,20 @@ T tensor_elem_at(const SimpleTensor<T> &in, Coordinates coord, BorderMode border
         }
         else
         {
-            return constant_border_value;
+            return static_cast<T>(constant_border_value);
         }
     }
     return in[coord2index(in.shape(), coord)];
 }
-template float tensor_elem_at(const SimpleTensor<float> &in, Coordinates coord, BorderMode border_mode, float constant_border_value);
+
 template uint8_t tensor_elem_at(const SimpleTensor<uint8_t> &in, Coordinates coord, BorderMode border_mode, uint8_t constant_border_value);
+template int16_t tensor_elem_at(const SimpleTensor<int16_t> &in, Coordinates coord, BorderMode border_mode, int16_t constant_border_value);
+template half tensor_elem_at(const SimpleTensor<half> &in, Coordinates coord, BorderMode border_mode, half constant_border_value);
+template float tensor_elem_at(const SimpleTensor<float> &in, Coordinates coord, BorderMode border_mode, float constant_border_value);
 
 // Return the bilinear value at a specified coordinate with different border modes
 template <typename T>
-T bilinear_policy(const SimpleTensor<T> &in, Coordinates id, float xn, float yn, BorderMode border_mode, uint8_t constant_border_value)
+T bilinear_policy(const SimpleTensor<T> &in, Coordinates id, float xn, float yn, BorderMode border_mode, T constant_border_value)
 {
     int idx = std::floor(xn);
     int idy = std::floor(yn);
@@ -71,22 +73,28 @@ T bilinear_policy(const SimpleTensor<T> &in, Coordinates id, float xn, float yn,
     const float dx_1 = 1.0f - dx;
     const float dy_1 = 1.0f - dy;
 
+    const T border_value = constant_border_value;
+
     id.set(0, idx);
     id.set(1, idy);
-    const T tl = tensor_elem_at(in, id, border_mode, constant_border_value);
+    const float tl = tensor_elem_at(in, id, border_mode, border_value);
     id.set(0, idx + 1);
     id.set(1, idy);
-    const T tr = tensor_elem_at(in, id, border_mode, constant_border_value);
+    const float tr = tensor_elem_at(in, id, border_mode, border_value);
     id.set(0, idx);
     id.set(1, idy + 1);
-    const T bl = tensor_elem_at(in, id, border_mode, constant_border_value);
+    const float bl = tensor_elem_at(in, id, border_mode, border_value);
     id.set(0, idx + 1);
     id.set(1, idy + 1);
-    const T br = tensor_elem_at(in, id, border_mode, constant_border_value);
+    const float br = tensor_elem_at(in, id, border_mode, border_value);
 
-    return tl * (dx_1 * dy_1) + tr * (dx * dy_1) + bl * (dx_1 * dy) + br * (dx * dy);
+    return static_cast<T>(tl * (dx_1 * dy_1) + tr * (dx * dy_1) + bl * (dx_1 * dy) + br * (dx * dy));
 }
+
 template uint8_t bilinear_policy(const SimpleTensor<uint8_t> &in, Coordinates id, float xn, float yn, BorderMode border_mode, uint8_t constant_border_value);
+template int16_t bilinear_policy(const SimpleTensor<int16_t> &in, Coordinates id, float xn, float yn, BorderMode border_mode, int16_t constant_border_value);
+template half bilinear_policy(const SimpleTensor<half> &in, Coordinates id, float xn, float yn, BorderMode border_mode, half constant_border_value);
+template float bilinear_policy(const SimpleTensor<float> &in, Coordinates id, float xn, float yn, BorderMode border_mode, float constant_border_value);
 
 /* Apply 2D spatial filter on a single element of @p in at coordinates @p coord
  *
diff --git a/tests/validation/CPP/Utils.h b/tests/validation/CPP/Utils.h
index 34ba60bed6..557d85f204 100644
--- a/tests/validation/CPP/Utils.h
+++ b/tests/validation/CPP/Utils.h
@@ -45,7 +45,7 @@ template <typename T>
 T tensor_elem_at(const SimpleTensor<T> &in, Coordinates coord, BorderMode border_mode, T constant_border_value);
 
 template <typename T>
-T bilinear_policy(const SimpleTensor<T> &in, Coordinates id, float xn, float yn, BorderMode border_mode, uint8_t constant_border_value);
+T bilinear_policy(const SimpleTensor<T> &in, Coordinates id, float xn, float yn, BorderMode border_mode, T constant_border_value);
 
 template <typename T1, typename T2, typename T3>
 void apply_2d_spatial_filter(Coordinates coord, const SimpleTensor<T1> &in, SimpleTensor<T3> &out, const TensorShape &filter_shape, const T2 *filter_itr, float scale, BorderMode border_mode,
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index 5fdd57f7ba..7043f1ebdc 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -27,7 +27,6 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "tests/Globals.h"
-#include "tests/validation/half.h"
 
 #include <random>
 #include <type_traits>
@@ -45,7 +44,7 @@ struct is_floating_point : public std::is_floating_point<T>
 };
 
 template <>
-struct is_floating_point<half_float::half> : public std::true_type
+struct is_floating_point<half> : public std::true_type
 {
 };
 
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 1c6811fd1c..f58f3a8338 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ActivationLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -144,16 +143,16 @@ using NEActivationLayerFixture = ActivationValidationFixture<Tensor, Accessor, N
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ActivationDataset),
-                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                                DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ActivationDataset),
+                                                                                                            framework::dataset::make("DataType",
+                                                                                                                    DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance(_data_type, _function));
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEActivationLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ActivationDataset),
-                                                                                                                      framework::dataset::make("DataType",
-                                                                                                                              DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEActivationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ActivationDataset),
+                                                                                                          framework::dataset::make("DataType",
+                                                                                                                  DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance(_data_type, _function));
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index 4cc589c71a..edc59a909f 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ArithmeticAdditionFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -183,8 +182,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half_float::half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP16Dataset),
+                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp
index 124f05a470..3d184f13f8 100644
--- a/tests/validation/NEON/ArithmeticSubtraction.cpp
+++ b/tests/validation/NEON/ArithmeticSubtraction.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ArithmeticSubtractionFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -185,8 +184,8 @@ TEST_SUITE_END()
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half_float::half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
-                                                                                                                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
+                                                                                                            framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 5880a7eda2..f74925c82b 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ConvolutionLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -107,16 +106,16 @@ using NEConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor,
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEConvolutionLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
-                       framework::dataset::make("ReshapeWeights", { true, false })),
-                       framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                                                                     framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                             framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
-                                                                                                                       framework::dataset::make("ReshapeWeights", { true, false })),
-                                                                                                                       framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeConvolutionLayerDataset(),
+                                                                                                                   framework::dataset::make("ReshapeWeights", { true, false })),
+                                                                                                           framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/NEON/DepthConcatenateLayer.cpp b/tests/validation/NEON/DepthConcatenateLayer.cpp
index c98d36bf8d..c500041a7c 100644
--- a/tests/validation/NEON/DepthConcatenateLayer.cpp
+++ b/tests/validation/NEON/DepthConcatenateLayer.cpp
@@ -32,7 +32,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/DepthConcatenateLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -51,14 +50,14 @@ using NEDepthConcatenateLayerFixture = DepthConcatenateValidationFixture<Tensor,
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConcatenateLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
-                       DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthConcatenateLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::Small2DShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConcatenateLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::DepthConcatenateShapes(), framework::dataset::make("DataType",
-                       DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthConcatenateLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::DepthConcatenateShapes(), framework::dataset::make("DataType",
+                                                                                                                DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/DepthConvert.cpp b/tests/validation/NEON/DepthConvert.cpp
index acc9d0f18b..e036cc45d1 100644
--- a/tests/validation/NEON/DepthConvert.cpp
+++ b/tests/validation/NEON/DepthConvert.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/DepthConvertFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 6211d31c45..9349ff8db8 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -33,7 +33,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/DirectConvolutionLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -100,7 +99,7 @@ using NEDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<Tenso
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half_float::half>, framework::DatasetMode::ALL, combine(data_f32, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(data_f32, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16);
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
index 55f8da97cf..9c354123a3 100644
--- a/tests/validation/NEON/FullyConnectedLayer.cpp
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -33,7 +33,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/FullyConnectedLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -122,16 +121,16 @@ using NEFullyConnectedLayerFixture = FullyConnectedLayerValidationFixture<Tensor
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallFullyConnectedLayerDataset(),
-                       FullyConnectedParameters),
-                       framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallFullyConnectedLayerDataset(),
+                                                                                                                        FullyConnectedParameters),
+                                                                                                                framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeFullyConnectedLayerDataset(),
-                       FullyConnectedParameters),
-                       framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEFullyConnectedLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeFullyConnectedLayerDataset(),
+                                                                                                                      FullyConnectedParameters),
+                                                                                                              framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index 05db3bbd6d..f2cfd0c957 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/GEMMFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -92,13 +91,13 @@ using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>;
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(), framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallGEMMDataset(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(), framework::dataset::make("DataType",
-                                                                                                           DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMDataset(), framework::dataset::make("DataType",
+                                                                                               DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f);
diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp
index 5d792e9e09..70ac937fa2 100644
--- a/tests/validation/NEON/NormalizationLayer.cpp
+++ b/tests/validation/NEON/NormalizationLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/NormalizationLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -69,12 +68,12 @@ using NENormalizationLayerFixture = NormalizationValidationFixture<Tensor, Acces
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(NormalizationDataset, framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index ac5a28b527..98ec478267 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -34,7 +34,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/PoolingLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -86,14 +85,14 @@ TEST_SUITE_END()
 
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half_float::half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                               framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPoolingLayerFixture<half>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                   framework::dataset::make("DataType", DataType::F16))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
-                                                                                                                   framework::dataset::make("DataType", DataType::F16))))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), combine(PoolingLayerDatasetFP,
+                                                                                                       framework::dataset::make("DataType", DataType::F16))))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 6063384a94..18a76e8b1b 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -46,13 +46,23 @@ namespace validation
 {
 namespace
 {
+/** Scale data types */
+const auto ScaleDataTypes = framework::dataset::make("DataType",
+{
+    DataType::U8,
+    DataType::S16,
+    DataType::F32,
+});
+
+/** Tolerance */
 constexpr AbsoluteTolerance<uint8_t> tolerance(1);
+RelativeTolerance<float>             tolerance_f32(0.01);
 } // namespace
 
 TEST_SUITE(NEON)
 TEST_SUITE(Scale)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), framework::dataset::make("DataType", DataType::U8)),
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(concat(datasets::SmallShapes(), datasets::LargeShapes()), ScaleDataTypes),
                                                                            framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                    datasets::BorderModes()),
                shape, data_type, policy, border_mode)
@@ -100,10 +110,41 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combi
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::U8)),
-                                                                                                             framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                     datasets::BorderModes()))
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                             DataType::F32)),
+                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                             datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F32)),
+                                                                                                         framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                 datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_f32);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
+TEST_SUITE(Integer)
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                               DataType::U8)),
+                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                               datasets::BorderModes()))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -124,6 +165,34 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<uint8_t>, framework::DatasetMode
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance);
 }
+TEST_SUITE_END()
+TEST_SUITE(S16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
+                                                                                                               DataType::S16)),
+                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                               datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
+                                                                                                                   DataType::S16)),
+                                                                                                           framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                   datasets::BorderModes()))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, BorderSize(1), (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
 
 TEST_SUITE_END()
 TEST_SUITE_END()
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 7ac7759c22..5ede321db1 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -33,7 +33,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/SoftmaxLayerFixture.h"
-#include "tests/validation/half.h"
 
 namespace arm_compute
 {
@@ -100,12 +99,12 @@ using NESoftmaxLayerFixture = SoftmaxValidationFixture<Tensor, Accessor, NESoftm
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
diff --git a/tests/validation/Validation.cpp b/tests/validation/Validation.cpp
index 1a082111a9..aa70e6e794 100644
--- a/tests/validation/Validation.cpp
+++ b/tests/validation/Validation.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "tests/validation/half.h"
 
 #include <array>
 #include <cmath>
@@ -79,7 +79,7 @@ double get_double_data(const void *ptr, DataType data_type)
         case DataType::S64:
             return *reinterpret_cast<const int64_t *>(ptr);
         case DataType::F16:
-            return *reinterpret_cast<const half_float::half *>(ptr);
+            return *reinterpret_cast<const half *>(ptr);
         case DataType::F32:
             return *reinterpret_cast<const float *>(ptr);
         case DataType::F64:
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index 6bc42a4ed6..e461633944 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -269,7 +269,7 @@ struct compare<RelativeTolerance<U>> : public compare_base<RelativeTolerance<U>>
             return true;
         }
 
-        const U epsilon = (std::is_same<half_float::half, typename std::remove_cv<U>::type>::value || (this->_reference == 0)) ? static_cast<U>(0.01) : std::numeric_limits<U>::epsilon();
+        const U epsilon = (std::is_same<half, typename std::remove_cv<U>::type>::value || (this->_reference == 0)) ? static_cast<U>(0.01) : std::numeric_limits<U>::epsilon();
 
         if(std::abs(static_cast<double>(this->_reference) - static_cast<double>(this->_target)) <= epsilon)
         {
diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h
index 53bb0f2124..ba252fbdc5 100644
--- a/tests/validation/fixtures/ScaleFixture.h
+++ b/tests/validation/fixtures/ScaleFixture.h
@@ -56,7 +56,7 @@ public:
         const float                            scale_x = distribution_float(generator);
         const float                            scale_y = distribution_float(generator);
         std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-        uint8_t                                constant_border_value = distribution_u8(generator);
+        T                                      constant_border_value = static_cast<T>(distribution_u8(generator));
 
         _target    = compute_target(shape, scale_x, scale_y, policy, border_mode, constant_border_value);
         _reference = compute_reference(shape, scale_x, scale_y, policy, border_mode, constant_border_value);
@@ -70,7 +70,7 @@ protected:
     }
 
     TensorType compute_target(const TensorShape &shape, const float scale_x, const float scale_y,
-                              InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+                              InterpolationPolicy policy, BorderMode border_mode, T constant_border_value)
     {
         // Create tensors
         TensorType  src = create_tensor<TensorType>(shape, _data_type);
@@ -103,7 +103,7 @@ protected:
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &shape, const float scale_x, const float scale_y,
-                                      InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
+                                      InterpolationPolicy policy, BorderMode border_mode, T constant_border_value)
     {
         // Create reference
         SimpleTensor<T> src{ shape, _data_type };