From 62eeb53a5eee9d388a6074553175909fd1b441b5 Mon Sep 17 00:00:00 2001
From: Sang-Hoon Park <sang-hoon.park@arm.com>
Date: Tue, 29 Oct 2019 13:13:19 +0000
Subject: COMPMID-2266: [CL] add support for Log Softmax

Change-Id: I4a8f3519328553e24cbb4fe45a8ca4d47c90975d
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2182
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/softmax_layer.cl           | 41 ++++++++++++++++++++++-
 src/core/CL/cl_kernels/softmax_layer_quantized.cl | 11 ++++--
 src/core/CL/kernels/CLSoftmaxLayerKernel.cpp      | 10 ++++--
 src/runtime/CL/functions/CLSoftmaxLayer.cpp       | 30 ++++++++++++-----
 4 files changed, 77 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index e549b44245..767cf4c4f7 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,7 +111,11 @@ __kernel void softmax_layer_norm(
     DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
     VEC_DATA_TYPE(DATA_TYPE, 16)
     data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
+#ifdef LOG_SOFTMAX
+    vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+#else  /* LOG_SOFTMAX */
     vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
+#endif /* LOG_SOFTMAX */
 }
 
 /** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
@@ -226,9 +230,15 @@ __kernel void softmax_layer_max_shift_exp_sum_serial(
 #ifdef BETA
         data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
 #endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+#else  /* LOG_SOFTMAX */
         data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
         VSTORE(VECTOR_SIZE)
         (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+#endif /* LOG_SOFTMAX */
         sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
     }
 
@@ -239,11 +249,19 @@ __kernel void softmax_layer_max_shift_exp_sum_serial(
 #ifdef BETA
     data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
 #endif /* BETA */
+#ifdef LOG_SOFTMAX
+    VSTORE(VECTOR_SIZE)
+    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    data = select(0, data, widx);
+#else  /* LOG_SOFTMAX */
     data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
     widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
     data = select(0, data, widx);
     VSTORE(VECTOR_SIZE)
     (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+#endif /* LOG_SOFTMAX */
     sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 
@@ -455,9 +473,15 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
 #ifdef BETA
         data = MUL_OP(data, beta, DATA_TYPE, 4);
 #endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        data = EXP_OP(data, DATA_TYPE, 4);
+#else  /* LOG_SOFTMAX */
         data = EXP_OP(data, DATA_TYPE, 4);
         VSTORE(4)
         (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+#endif /* LOG_SOFTMAX */
         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
     }
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
@@ -471,9 +495,15 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
 #ifdef BETA
         data = MUL_OP(data, beta, DATA_TYPE, 4);
 #endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        data = EXP_OP(data, DATA_TYPE, 4);
+#else  /* LOG_SOFTMAX */
         data = EXP_OP(data, DATA_TYPE, 4);
         VSTORE(4)
         (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+#endif /* LOG_SOFTMAX */
         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
     }
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
@@ -491,12 +521,21 @@ __kernel void softmax_layer_max_shift_exp_sum_parallel(
 #ifdef BETA
         data = MUL_OP(data, beta, DATA_TYPE, 4);
 #endif /* BETA */
+#ifdef LOG_SOFTMAX
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        data = select(0, data, widx);
+#else  /* LOG_SOFTMAX */
         data = EXP_OP(data, DATA_TYPE, 4);
         VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
         widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
         data = select(0, data, widx);
         VSTORE(4)
         (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+#endif /* LOG_SOFTMAX */
         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
     }
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index 95d6d4bcc5..8ccc5d3dd5 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #define MAX_OP(x, y, type, size) max((x), (y))
 #define ADD_OP(x, y, type, size) ((x) + (y))
+#define SUB_OP(x, y, type, size) ((x) - (y))
 
 /* Number of workitems in dimension 0. */
 #if !defined(GRID_SIZE)
@@ -559,12 +560,14 @@ __kernel void softmax_layer_norm_quantized(
     int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
 
     // It will be better to calculate this in prev layer and pass here as parameter
-    uint  sum_val_u               = convert_uint(sum_val);
+    uint sum_val_u = convert_uint(sum_val);
+#ifndef LOG_SOFTMAX
     int   headroom_plus_one       = clz(sum_val_u);
     int   num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
     int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
     int16 shifted_sum_minus_one   = shifted_sum_minus_one_1;
     int16 shifted_scale           = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);
+#endif /* LOG_SOFTMAX */
 
     // It was already calculated in prev layer, should be stored into tmp output and reused
     int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
@@ -577,8 +580,12 @@ __kernel void softmax_layer_norm_quantized(
 #endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
     int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
 
+#ifdef LOG_SOFTMAX
+    data = SUB_OP(data_diff_mult, (int16)sum_val_u, int, 16);
+#else  /* LOG_SOFTMAX */
     data = ASYMM_MULT(shifted_scale, data, 16);
     data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
+#endif /* LOG_SOFTMAX */
     data = select(0, data, data_diff >= (int16)(DIFF_MIN));
     vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
 }
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index a9c08703c0..f24c25f507 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
@@ -217,7 +218,7 @@ CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
 {
 }
 
-void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
+void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
 
@@ -236,6 +237,7 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
     const DataType                dt                 = input->info()->data_type();
     const UniformQuantizationInfo qinfo              = input->info()->quantization_info().uniform();
     const size_t                  reduction_dim_size = input->info()->dimension(0);
+    const float                   beta               = info.beta;
 
     // Set build options
     CLBuildOptions build_opts;
@@ -243,6 +245,7 @@ void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor
     build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
     build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
+    build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
 
     cl::NDRange lws_hint(cl::NullRange);
     std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("softmax_layer_max_shift_exp_sum_quantized_serial") :
@@ -334,7 +337,7 @@ CLLogits1DNormKernel::CLLogits1DNormKernel()
 {
 }
 
-void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta)
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
 
@@ -359,7 +362,8 @@ void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *su
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_options_if(is_quantized_asymmetric,
-                              prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
+                              prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options());
+    build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX");
 
     // Create kernel
     std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm";
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 73add97ef1..32d7f4423d 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -34,13 +34,15 @@
 
 namespace arm_compute
 {
-CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
+template <bool IS_LOG>
+CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
       _needs_flattening(false)
 {
 }
 
-void CLSoftmaxLayer::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
 {
     // Flatten the input
     const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
@@ -69,11 +71,12 @@ void CLSoftmaxLayer::configure_reshape_input_kernel(const ICLTensor *input, cons
     auto_init_if_empty(*output->info(), *input->info()->clone());
 }
 
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info(), beta, axis));
+    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis));
 
     // We don't need flattening only in the case the input is 2D and axis is 1
     _needs_flattening = axis != 1;
@@ -114,8 +117,12 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
     _memory_group.manage(&_max);
     _memory_group.manage(&_sum);
 
+    SoftmaxKernelInfo softmax_info;
+    softmax_info.beta   = beta;
+    softmax_info.is_log = IS_LOG;
+
     // Configure kernels
-    _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, beta);
+    _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, softmax_info);
 
     if(_needs_flattening)
     {
@@ -123,7 +130,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
         _memory_group.manage(&_output_flattened);
 
         // The normalization kernel stores the result in a flat output tensor
-        _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, beta);
+        _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, softmax_info);
 
         // Reshape the flat output into a the requested (4D) output
         _reshape_kernel.configure(&_output_flattened, output);
@@ -135,7 +142,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
     else
     {
         // Softmax 2D case
-        _norm_kernel.configure(&_tmp, &_sum, output, beta);
+        _norm_kernel.configure(&_tmp, &_sum, output, softmax_info);
     }
 
     // Allocate intermediate buffers
@@ -144,7 +151,8 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
     _sum.allocator()->allocate();
 }
 
-Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
+template <bool IS_LOG>
+Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
@@ -188,7 +196,8 @@ Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out
     return Status{};
 }
 
-void CLSoftmaxLayer::run()
+template <bool IS_LOG>
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
@@ -206,4 +215,7 @@ void CLSoftmaxLayer::run()
     }
 }
 
+template class CLSoftmaxLayerGeneric<false>;
+template class CLSoftmaxLayerGeneric<true>;
+
 } // namespace arm_compute
-- 
cgit v1.2.1