author    Michel Iwaniec <michel.iwaniec@arm.com>    2017-10-12 14:14:15 +0100
committer Anthony Barbier <anthony.barbier@arm.com>  2018-11-02 16:35:24 +0000
commit    0063380ca6e43d04722707c707e610b59e1f8dde (patch)
tree      c60f6e5b380851cefd5aa994b75d3e4ab3484055 /src/core
parent    27c9efb922832e5e6785a492e84a46934d9a47f8 (diff)
IVGCVSW-619: Support for Cl u8 bounded Relu
Change-Id: I3c39ecbd36f06d5376c35ed4eb38dd73533ef97e
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/93686
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--  src/core/CL/CLHelpers.cpp                        |   3
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp                  |   5
-rw-r--r--  src/core/CL/cl_kernels/activation_layer_qa8.cl   | 100
-rw-r--r--  src/core/CL/kernels/CLActivationLayerKernel.cpp  |  48
-rw-r--r--  src/core/TensorInfo.cpp                          |  10
5 files changed, 161 insertions, 5 deletions
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 821fb4c051..09ec329e4c 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -72,6 +72,8 @@ std::string get_cl_type_from_data_type(const DataType &dt)
return "qs8";
case DataType::S8:
return "char";
+ case DataType::QASYMM8:
+ return "uchar";
case DataType::U16:
return "ushort";
case DataType::S16:
@@ -105,6 +107,7 @@ std::string get_data_size_from_data_type(const DataType &dt)
case DataType::U8:
case DataType::QS8:
case DataType::S8:
+ case DataType::QASYMM8:
return "8";
case DataType::U16:
case DataType::S16:
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6e5e802538..62ef2593e7 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -107,6 +107,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "accumulate_squared", "accumulate.cl" },
{ "accumulate_weighted", "accumulate.cl" },
{ "activation_layer", "activation_layer.cl" },
+ { "activation_layer_qa8", "activation_layer_qa8.cl" },
{ "arithmetic_add", "arithmetic_op.cl" },
{ "arithmetic_sub", "arithmetic_op.cl" },
{ "bitwise_or", "bitwise_op.cl" },
@@ -306,6 +307,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
#include "./cl_kernels/activation_layer.clembed"
},
{
+ "activation_layer_qa8.cl",
+#include "./cl_kernels/activation_layer_qa8.clembed"
+ },
+ {
"arithmetic_op.cl",
#include "./cl_kernels/arithmetic_op.clembed"
},
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
new file mode 100644
index 0000000000..4d9bf0efad
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+ return min((TYPE)A_VAL, max(x, (TYPE)0));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+ return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+
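+// Note: two expansion levels are needed so that the ACT argument is expanded
+// to its value before ## pasting forms the <act>_op function name.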
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+/** This performs an activation function on QASYMM8 inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uchar
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=lu_brelu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer_qa8(
+ TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+ ,
+ TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D output = input;
+#else /* IN_PLACE */
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+ // Load data
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+ // Perform activation
+ data = ACTIVATION_OP(ACT, data);
+
+ // Requantize to the output quantization space
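+ // Note: the convert_float16/convert_uchar16 calls below assume VEC_SIZE == 16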
+ float16 fdata = convert_float16(data);
+ fdata = round((fdata - O1_VAL) * (S1_VAL / S2_VAL) + O2_VAL);
+ uchar16 qdata = convert_uchar16(fdata);
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (qdata, 0, (__global DATA_TYPE *)output.ptr);
+}
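For reference, the requantization step above maps a value from the input quantization space (scale S1_VAL, offset O1_VAL) into the output space (S2_VAL, O2_VAL). A minimal C++ sketch of the same per-element formula, with illustrative scale/offset values (the clamp is added here for safety; the kernel itself relies on the result fitting in uchar):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Mirror of: round((fdata - O1_VAL) * (S1_VAL / S2_VAL) + O2_VAL)
    std::uint8_t requantize(std::uint8_t q, float s1, float o1, float s2, float o2)
    {
        const float r = std::round((static_cast<float>(q) - o1) * (s1 / s2) + o2);
        return static_cast<std::uint8_t>(std::min(255.0f, std::max(0.0f, r)));
    }

    int main()
    {
        // Input space: scale 0.5, offset 10. Output space: scale 0.25, offset 20.
        // q = 30 represents (30 - 10) * 0.5 = 10.0, which requantizes to
        // 10.0 / 0.25 + 20 = 60.
        std::printf("%d\n", static_cast<int>(requantize(30, 0.5f, 10.0f, 0.25f, 20.0f))); // prints 60
    }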
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 18202c1c5b..bed407a3d1 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -34,6 +34,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
#include "support/ToolchainSupport.h"
#include <cmath>
@@ -47,7 +50,14 @@ CLActivationLayerKernel::CLActivationLayerKernel()
void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32, DataType::QASYMM8);
+
+ // For QASYMM8, only lower/upper bounded relu is supported
+ if(input->info()->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ "For QASYMM8 only lower/upper bounded relu is supported");
+ }
if(output != nullptr)
{
@@ -74,8 +84,22 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
- build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const)));
- build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const)));
+
+ if(input->info()->data_type() == DataType::QASYMM8)
+ {
+ // For lower/upper bounded relu make sure that the min/max values are in the quantized input space
+ int a_const_u8 = input->info()->quantization_info().quantize(a_const);
+ int b_const_u8 = input->info()->quantization_info().quantize(b_const);
+
+ build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_u8)));
+ build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_u8)));
+ }
+ else
+ {
+ build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const)));
+ build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const)));
+ }
+
build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
if(is_data_type_fixed_point(input->info()->data_type()))
{
@@ -83,7 +107,23 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
}
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+ if(input->info()->data_type() == DataType::QASYMM8)
+ {
+ float s1 = input->info()->quantization_info().scale;
+ float o1 = input->info()->quantization_info().offset;
+ // If output is nullptr, assume same quantization scale/offset as input
+ float s2 = output != nullptr ? output->info()->quantization_info().scale : s1;
+ float o2 = output != nullptr ? output->info()->quantization_info().offset : o1;
+ build_opts.emplace(("-DS1_VAL=" + support::cpp11::to_string(s1)));
+ build_opts.emplace(("-DS2_VAL=" + support::cpp11::to_string(s2)));
+ build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+ build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer_qa8", build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+ }
// Make sure _kernel is initialized before calling the parent's configure
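To make the QASYMM8 branch above concrete: a sketch of how the activation bounds end up as build options, assuming QuantizationInfo::quantize follows the usual asymmetric mapping round(value / scale) + offset (the exact rounding policy is not visible in this diff) and illustrative quantization parameters:

    #include <cmath>
    #include <cstdio>

    // Hypothetical stand-in for QuantizationInfo::quantize.
    int quantize(float value, float scale, int offset)
    {
        return static_cast<int>(std::round(value / scale)) + offset;
    }

    int main()
    {
        // Input tensor quantized with scale = 0.1, offset = 128.
        // LU_BOUNDED_RELU with a = 6.0 and b = 0.0 yields:
        std::printf("-DA_VAL=%d -DB_VAL=%d\n",
                    quantize(6.0f, 0.1f, 128),   // 60 + 128 = 188
                    quantize(0.0f, 0.1f, 128));  // 0 + 128 = 128
    }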
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 91a35315dc..f3cd776497 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -26,13 +26,14 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
using namespace arm_compute;
TensorInfo::TensorInfo()
: _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
- _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+ _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info()
{
}
@@ -80,6 +81,13 @@ TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, Dat
init(tensor_shape, num_channels, data_type, fixed_point_position);
}
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info)
+ : TensorInfo()
+{
+ init(tensor_shape, num_channels, data_type, 0);
+ _quantization_info = quantization_info;
+}
+
TensorInfo::TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height)
: TensorInfo()
{