 arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h |  3 ++-
 arm_compute/runtime/CL/functions/CLSoftmaxLayer.h  |  3 ++-
 src/core/CL/cl_kernels/softmax_layer.cl            | 15 +++++++++++++++
 src/core/CL/kernels/CLSoftmaxLayerKernel.cpp       |  8 +++++++-
 src/runtime/CL/functions/CLSoftmaxLayer.cpp        |  4 ++--
 5 files changed, 28 insertions(+), 5 deletions(-)
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
index 1e641b48d8..60d555019d 100644
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -60,10 +60,11 @@ public:
*
* @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
* @param[in] max Max values tensor. Data types supported: same as @p input
+ * @param[in] beta A scaling factor for the exponent. QS8/QS16/F16 only support a beta value of 1.
* @param[out] output Destination tensor. Data types supported: same as @p input
* @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum);
+ void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 70a265c1ae..e87deb6d15 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -54,9 +54,10 @@ public:
/** Set the input and output tensors.
*
* @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32
+ * @param[in] beta A scaling factor for the exponent. QS8/QS16/F16 only support a beta value of 1.
* @param[out] output Destination tensor. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f);
// Inherited methods overridden:
void run() override;
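
The default argument keeps existing callers source-compatible; only code that wants a non-default beta has to change. Below is a minimal caller-side sketch (the tensor shape, beta value, and variable names are hypothetical, not part of this patch); note that per the documentation above, beta != 1.0f is only valid for F32 input:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Hypothetical 1D logits row; beta != 1.0f requires F32 data.
        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));

        CLSoftmaxLayer softmax;
        softmax.configure(&input, &output, 0.5f); // computes softmax(0.5f * x)

        // Allocate after configure, per the library's usual pattern.
        input.allocator()->allocate();
        output.allocator()->allocate();

        // ... fill input ...
        softmax.run();
        CLScheduler::get().sync();
        return 0;
    }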
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 9b24380393..010135eb7b 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -29,6 +29,7 @@
#define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
#define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
#define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
+#define MUL_OP(x, y, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
#define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
#define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
@@ -42,6 +43,7 @@
#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
+#define MUL_OP(x, y, type, size) ((x) * (y))
#define DIV_OP(x, y, type, size) ((x) / (y))
#define EXP_OP(x, type, size) exp((x))
@@ -128,6 +130,7 @@ __kernel void softmax_layer_max(
* @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
* @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
* @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ * @note Beta can optionally be passed at compile time using -DBETA (if undefined, it is assumed to be 1.0)
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -175,6 +178,12 @@ __kernel void softmax_layer_shift_exp_sum(
Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+#ifdef BETA
+ // Initialize beta
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ beta = (VEC_DATA_TYPE(DATA_TYPE, 16))BETA;
+#endif /* BETA */
+
// Load max value of 1D logits vector (row)
DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
@@ -189,6 +198,9 @@ __kernel void softmax_layer_shift_exp_sum(
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
data = SUB_OP(data, max_val, DATA_TYPE, 16);
+#ifdef BETA
+ data = MUL_OP(data, beta, DATA_TYPE, 16);
+#endif /* BETA */
data = EXP_OP(data, DATA_TYPE, 16);
vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
@@ -199,6 +211,9 @@ __kernel void softmax_layer_shift_exp_sum(
VEC_DATA_TYPE(DATA_TYPE, 16)
data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
data = SUB_OP(data, max_val, DATA_TYPE, 16);
+#ifdef BETA
+ data = MUL_OP(data, beta, DATA_TYPE, 16);
+#endif /* BETA */
data = EXP_OP(data, DATA_TYPE, 16);
VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
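
Mathematically, the kernel now computes a temperature-scaled (beta-scaled) softmax. Applying the scale after the max subtraction is safe because the shift by the row maximum m cancels out of the quotient; the identity the shift-exp-sum stage relies on is:

    \[
      \operatorname{softmax}_{\beta}(x)_i
        = \frac{e^{\beta x_i}}{\sum_j e^{\beta x_j}}
        = \frac{e^{\beta (x_i - m)}}{\sum_j e^{\beta (x_j - m)}},
      \qquad m = \max_k x_k,
    \]

which follows by multiplying numerator and denominator by e^{-beta*m}. For beta > 0 the shift keeps every exponent non-positive, so the exponentials cannot overflow.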
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index da3b9423d5..fb066bc645 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -105,10 +105,11 @@ CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
{
}
-void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+ ARM_COMPUTE_ERROR_ON(beta != 1.0f && input->info()->data_type() != DataType::F32);
// Output auto initialization if not yet initialized
auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
@@ -145,6 +146,11 @@ void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTen
build_opts.emplace("-DNON_MULTIPLE_OF_16");
}
+ if(beta != 1.0f)
+ {
+ build_opts.emplace(("-DBETA=" + float_to_string_with_full_precision(beta)));
+ }
+
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
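
Passing beta as a -DBETA build option bakes it into the kernel source at compile time, so the default beta == 1.0f path carries no extra multiply at all: the #ifdef BETA blocks are preprocessed away. float_to_string_with_full_precision is the library's own helper; purely as an illustration of the round-trip requirement it has to meet (not the actual implementation), a stand-in could look like this:

    #include <iomanip>
    #include <limits>
    #include <sstream>
    #include <string>

    // Illustrative stand-in only: serialize beta with enough digits
    // (max_digits10) that the OpenCL compiler parses back the exact value.
    std::string beta_build_option(float beta)
    {
        std::stringstream ss;
        ss << "-DBETA=" << std::setprecision(std::numeric_limits<float>::max_digits10) << beta;
        return ss.str();
    }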
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7505a2c974..fa324ee61d 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -35,7 +35,7 @@ CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
{
}
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
@@ -55,7 +55,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
// Configure Kernels
_max_kernel.configure(input, &_max);
- _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+ _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
_norm_kernel.configure(&_tmp, &_sum, output);
// Allocate intermediate buffers
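
End to end, the three kernels configured above implement the classic shifted softmax in three passes: a row-max reduction, a shift-scale-exponentiate pass that also accumulates the row sum, and a final normalization. A plain C++ scalar reference of the same computation, useful for checking the OpenCL path on a single row (the function name is made up for illustration):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Stage 1: row max; stage 2: exp(beta * (x - max)) plus its sum;
    // stage 3: divide by the sum. Assumes x is non-empty.
    std::vector<float> softmax_ref(const std::vector<float> &x, float beta)
    {
        const float m = *std::max_element(x.begin(), x.end());
        std::vector<float> y(x.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            y[i] = std::exp(beta * (x[i] - m));
            sum += y[i];
        }
        for(float &v : y)
        {
            v /= sum;
        }
        return y;
    }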