From 48a60f9f7b0b7b5cf38253b7a2ac576aac43ef78 Mon Sep 17 00:00:00 2001
From: Pablo Palmier
Date: Wed, 18 Oct 2017 11:03:08 +0100
Subject: IVGCVSW-632 CL support for Softmax beta parameter

Change-Id: I21da48d2f40aa900301235eaced54b7eb644b0b2
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/91307
Tested-by: Kaizen
Reviewed-by: Anthony Barbier
---
 src/core/CL/cl_kernels/softmax_layer.cl      | 15 +++++++++++++++
 src/core/CL/kernels/CLSoftmaxLayerKernel.cpp |  8 +++++++-
 src/runtime/CL/functions/CLSoftmaxLayer.cpp  |  4 ++--
 3 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 9b24380393..010135eb7b 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -29,6 +29,7 @@
 #define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
 #define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
 #define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
+#define MUL_OP(x, y, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
 #define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
 #define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
 
@@ -42,6 +43,7 @@
 #define MAX_OP(x, y, type, size) max((x), (y))
 #define ADD_OP(x, y, type, size) ((x) + (y))
 #define SUB_OP(x, y, type, size) ((x) - (y))
+#define MUL_OP(x, y, type, size) ((x) * (y))
 #define DIV_OP(x, y, type, size) ((x) / (y))
 #define EXP_OP(x, type, size) exp((x))
 
@@ -128,6 +130,7 @@ __kernel void softmax_layer_max(
  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (if undefined, assume it equals 1.0)
  *
  * @param[in] src_ptr      Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -175,6 +178,12 @@ __kernel void softmax_layer_shift_exp_sum(
     Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
     Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, 16))BETA;
+#endif /* BETA */
+
     // Load max value of 1D logits vector (row)
     DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
 
@@ -189,6 +198,9 @@ __kernel void softmax_layer_shift_exp_sum(
         VEC_DATA_TYPE(DATA_TYPE, 16)
         data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
         data = SUB_OP(data, max_val, DATA_TYPE, 16);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 16);
+#endif /* BETA */
         data = EXP_OP(data, DATA_TYPE, 16);
         vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
@@ -199,6 +211,9 @@
     VEC_DATA_TYPE(DATA_TYPE, 16)
     data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
     data = SUB_OP(data, max_val, DATA_TYPE, 16);
+#ifdef BETA
+    data = MUL_OP(data, beta, DATA_TYPE, 16);
+#endif /* BETA */
     data = EXP_OP(data, DATA_TYPE, 16);
     VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
     widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index da3b9423d5..fb066bc645 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -105,10 +105,11 @@ CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
 {
 }
 
-void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+    ARM_COMPUTE_ERROR_ON(beta != 1.0f && input->info()->data_type() != DataType::F32);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
@@ -145,6 +146,11 @@ void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTen
         build_opts.emplace("-DNON_MULTIPLE_OF_16");
     }
 
+    if(beta != 1.0f)
+    {
+        build_opts.emplace(("-DBETA=" + float_to_string_with_full_precision(beta)));
+    }
+
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
 
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7505a2c974..fa324ee61d 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -35,7 +35,7 @@ CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
 {
 }
 
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
 
@@ -55,7 +55,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
 
     // Configure Kernels
     _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
     _norm_kernel.configure(&_tmp, &_sum, output);
 
     // Allocate intermediate buffers
--
cgit v1.2.1
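
Editor's note on the semantics (not part of the patch): the shift_exp_sum
kernel already subtracts the per-row maximum for numerical stability, and the
change above multiplies the shifted logits by beta before exponentiation.
Since a constant shift cancels out of the softmax ratio, each row now computes

    softmax(beta * x)_i = exp(beta * (x_i - max_j x_j)) / sum_j exp(beta * (x_j - max_j x_j))

so beta acts as an inverse temperature: beta > 1 sharpens the output
distribution, beta < 1 flattens it, and beta == 1 (no -DBETA define)
reproduces the standard softmax.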
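
A minimal usage sketch of the extended API (editor's illustration, not part of
the patch). It assumes the public CLSoftmaxLayer header, changed outside the
'src' filter shown above, declares the new beta parameter (presumably
defaulted to 1.0f); the tensor shape and beta value are arbitrary:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

    using namespace arm_compute;

    int main()
    {
        // Create the default OpenCL context and queue for the CL backend
        CLScheduler::get().default_init();

        // One row of 1000 logits; beta != 1.0f requires F32 input
        // (enforced by the new ARM_COMPUTE_ERROR_ON in the kernel's configure)
        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));

        // Configure softmax with a scaling factor beta = 0.5
        CLSoftmaxLayer softmax;
        softmax.configure(&input, &output, 0.5f);

        // Allocate backing CL buffers once configuration is done
        input.allocator()->allocate();
        output.allocator()->allocate();

        // ... fill `input`, e.g. via input.map() / input.unmap() ...

        softmax.run();             // enqueue max, shift/exp/sum and norm kernels
        CLScheduler::get().sync(); // wait for the queued kernels to finish
        return 0;
    }

Passing beta to the kernel as a compile-time define (-DBETA) rather than as a
kernel argument keeps the common beta == 1.0f path free of the extra multiply:
MUL_OP is only compiled into softmax_layer_shift_exp_sum when the define is
present.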