From d24affe0abefe8f4a83c7d4487386920895fd2e7 Mon Sep 17 00:00:00 2001 From: Sang-Hoon Park Date: Tue, 8 Oct 2019 18:07:23 +0100 Subject: COMPMID-2265 add support for Log Softmax to NEON Kernel (NEON/reference), validation tests, function and fixture are updated to add support for Log Softmax Change-Id: I641dbf1552f4128c691af8875949ebf88da71ee8 Signed-off-by: Sang-Hoon Park Reviewed-on: https://review.mlplatform.org/c/2075 Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio Tested-by: Arm Jenkins --- .../core/NEON/kernels/NESoftmaxLayerKernel.h | 12 +- .../runtime/NEON/functions/NESoftmaxLayer.h | 44 +++--- src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 162 ++++++++++++++++---- src/runtime/NEON/functions/NESoftmaxLayer.cpp | 23 ++- tests/validation/NEON/LogSoftmaxLayer.cpp | 165 +++++++++++++++++++++ tests/validation/fixtures/SoftmaxLayerFixture.h | 42 +++--- tests/validation/reference/LogSoftmaxLayer.cpp | 61 ++++++++ tests/validation/reference/LogSoftmaxLayer.h | 47 ++++++ tests/validation/reference/SoftmaxLayer.cpp | 37 ++++- tests/validation/reference/SoftmaxLayer.h | 5 +- 10 files changed, 517 insertions(+), 81 deletions(-) create mode 100644 tests/validation/NEON/LogSoftmaxLayer.cpp create mode 100644 tests/validation/reference/LogSoftmaxLayer.cpp create mode 100644 tests/validation/reference/LogSoftmaxLayer.h diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h index 25c3196e34..fb650794fa 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -69,12 +69,20 @@ private: }; /** Interface for softmax computation for QASYMM8 with pre-computed max. */ +template class NELogits1DSoftmaxKernel : public INEKernel { public: const char *name() const override { - return "NELogits1DSoftmaxKernel"; + if(IS_LOG) + { + return "NELogits1DSoftmaxKernel"; + } + else + { + return "NELogits1DLogSoftmaxKernel"; + } } /** Default constructor */ NELogits1DSoftmaxKernel(); diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h index 4932aeff5a..9cc7088ae2 100644 --- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -36,29 +36,33 @@ namespace arm_compute { class ITensor; -/** Basic function to compute a SoftmaxLayer. +/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. 
* * Softmax is calculated by : * @f[ out = \frac{e^{x - max(x)}}{\sum{e^{x - max(x)}}} @f] * + * Log Softmax is calculated by : + * @f[ out = (x - max(x)) - \sum{e^{x - max(x)}} @f] + * * This function runs the following kernels: * -# @ref NEFillBorderKernel * -# @ref NELogits1DMaxKernel * -# @ref NELogits1DSoftmaxKernel */ -class NESoftmaxLayer : public IFunction +template +class NESoftmaxLayerGeneric : public IFunction { public: /** Constructor */ - NESoftmaxLayer(std::shared_ptr memory_manager = nullptr); + NESoftmaxLayerGeneric(std::shared_ptr memory_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESoftmaxLayer(const NESoftmaxLayer &) = delete; + NESoftmaxLayerGeneric(const NESoftmaxLayerGeneric &) = delete; /** Default move constructor */ - NESoftmaxLayer(NESoftmaxLayer &&) = default; + NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESoftmaxLayer &operator=(const NESoftmaxLayer &) = delete; + NESoftmaxLayerGeneric &operator=(const NESoftmaxLayerGeneric &) = delete; /** Default move assignment operator */ - NESoftmaxLayer &operator=(NESoftmaxLayer &&) = default; + NESoftmaxLayerGeneric &operator=(NESoftmaxLayerGeneric &&) = default; /** Set the input and output tensors. * * @param[in,out] input Source tensor. Data types supported: QASYMM8/F16/F32. If the width is not a @@ -103,17 +107,21 @@ private: */ void configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis); - MemoryGroup _memory_group; - NELogits1DMaxKernel _max_kernel; - NELogits1DSoftmaxKernel _softmax_kernel; - std::unique_ptr _flat_or_reshape_kernel_ptr; - NEFillBorderKernel _fill_border_kernel; - NEReshapeLayerKernel _reshape_kernel; - Tensor _max; - Tensor _tmp; - Tensor _input_flattened; - Tensor _output_flattened; - bool _needs_flattening; + MemoryGroup _memory_group; + NELogits1DMaxKernel _max_kernel; + NELogits1DSoftmaxKernel _softmax_kernel; + std::unique_ptr _flat_or_reshape_kernel_ptr; + NEFillBorderKernel _fill_border_kernel; + NEReshapeLayerKernel _reshape_kernel; + Tensor _max; + Tensor _tmp; + Tensor _input_flattened; + Tensor _output_flattened; + bool _needs_flattening; }; + +using NESoftmaxLayer = NESoftmaxLayerGeneric; +using NELogSoftmaxLayer = NESoftmaxLayerGeneric; + } // namespace arm_compute #endif /* __ARM_COMPUTE_NESOFTMAXLAYER_H__ */ diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index 4144a1877b..1003ebd2e3 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -333,6 +333,19 @@ float32x4x4_t vadd(float32x4x4_t a, float32x4x4_t b) return res; } +float32x4x4_t vsub_n(float32x4x4_t a, float val) +{ + auto scalar_vector = vdup_n(val); + float32x4x4_t res = { { + vsubq_f32(a.val[0], scalar_vector.val[0]), + vsubq_f32(a.val[1], scalar_vector.val[1]), + vsubq_f32(a.val[2], scalar_vector.val[2]), + vsubq_f32(a.val[3], scalar_vector.val[3]) + } + }; + return res; +} + namespace { Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) @@ -590,6 +603,7 @@ elem_type_t reduce_add(F add_fn, V vec) return reduce_add_impl < elem_type_t, N, 0, N - 1 >::reduce(add_fn, vec); } +template void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta, const Window &window) { const int start_x = 
in.info()->valid_region().anchor.x(); @@ -608,7 +622,8 @@ void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *cons const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; const auto tmp_ptr = reinterpret_cast(tmp); - float sum_inversed; + float sum{}; + float sum_inversed{}; /* Compute exponentials and sum */ { @@ -622,33 +637,55 @@ void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *cons /* Loop over row and compute exponentials and sum */ int i = 0; constexpr int vec_size = vec_size_of(vec_max); + for(; i <= (input_width - vec_size); i += vec_size) { auto vec_elements = vld>(in_ptr + i); vec_elements = vsubq_u8(vec_max, vec_elements); auto vec_elements_flt = vcvt(vec_elements); - vec_elements_flt = vexp(vmul_n(vec_elements_flt, scale_beta)); - - vec_sum = vadd(vec_sum, vec_elements_flt); + if(is_log) + { + vec_elements_flt = vmul_n(vec_elements_flt, scale_beta); + vec_sum = vadd(vec_sum, vexp(vec_elements_flt)); + } + else + { + vec_elements_flt = vexp(vmul_n(vec_elements_flt, scale_beta)); + vec_sum = vadd(vec_sum, vec_elements_flt); + } vst4q_f32(tmp_ptr + i, vec_elements_flt); } + /* Reduce sum */ const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); const auto sum_8_byte = vadd_f32(vget_low(sum_16_byte), vget_high(sum_16_byte)); - float sum = reduce_add(std::plus(), sum_8_byte); + sum = reduce_add(std::plus(), sum_8_byte); /* Run remaining elements */ for(; i < input_width; ++i) { - const float element = std::exp((max_val - in_ptr[i]) * scale_beta); - sum += element; + float element{}; + if(is_log) + { + element = (max_val - in_ptr[i]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[i]) * scale_beta); + sum += element; + } + tmp_ptr[i] = element; } - sum_inversed = 256.f / sum; + if(!is_log) + { + sum_inversed = 256.f / sum; + } } /* Normalize exponentials */ @@ -657,24 +694,40 @@ void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *cons int i = 0; { constexpr int vec_size = 16; + for(; i <= (input_width - vec_size); i += vec_size) { - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + i); - auto normalized_value = vcvt>(vmul_n(vec_in, sum_inversed)); + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + i); + vec_16_byte_t normalized_value{}; + if(is_log) + { + normalized_value = vcvt>(vsub_n(vec_in, sum)); + } + else + { + normalized_value = vcvt>(vmul_n(vec_in, sum_inversed)); + } vst(out_ptr + i, normalized_value); } } /* Run remaining elements */ for(; i < input_width; ++i) { - out_ptr[i] = utils::cast::saturate_cast(tmp_ptr[i] * sum_inversed); + if(is_log) + { + out_ptr[i] = utils::cast::saturate_cast(tmp_ptr[i] - sum); + } + else + { + out_ptr[i] = utils::cast::saturate_cast(tmp_ptr[i] * sum_inversed); + } } } }, in_it, max_it, out_it); } -template +template void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta, const Window &window) { @@ -692,7 +745,8 @@ void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; const auto tmp_ptr = reinterpret_cast(tmp); - T sum_inversed; + T sum{}; + T sum_inversed{}; /* Compute exponentials and sum */ { @@ -706,46 +760,87 @@ void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const /* Loop over row and compute exponentials and sum */ int i = 0; constexpr int vec_size = vec_size_of(vec_sum); + for(; i 
<= (input_width - vec_size); i += vec_size) { auto vec_elements = vld>(in_ptr + i); vec_elements = vsub(vec_elements, vec_max); - vec_elements = vexp(vmul_n(vec_elements, static_cast(beta))); - vec_sum = vadd(vec_sum, vec_elements); + if(is_log) + { + vec_elements = vmul_n(vec_elements, static_cast(beta)); + vec_sum = vadd(vec_sum, vexp(vec_elements)); + } + else + { + vec_elements = vexp(vmul_n(vec_elements, static_cast(beta))); + vec_sum = vadd(vec_sum, vec_elements); + } vst(tmp_ptr + i, vec_elements); } + /* Reduce sum */ const auto sum_8_byte = vadd(vget_high(vec_sum), vget_low(vec_sum)); - T sum = reduce_add([](T a, T b) -> T { return a + b; }, sum_8_byte); + sum = reduce_add([](T a, T b) -> T { return a + b; }, sum_8_byte); /* Run remaining elements */ + for(; i < input_width; ++i) { - T element = std::exp((in_ptr[i] - max_val) * beta); - sum += element; + T element{}; + + if(is_log) + { + element = (in_ptr[i] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[i] - max_val) * beta); + sum += element; + } tmp_ptr[i] = element; } - sum_inversed = T(1) / sum; + if(!is_log) + { + sum_inversed = T(1) / sum; + } } /* Normalize exponentials */ { /* Loop over row and compute softmax */ int i = 0; + { constexpr int vec_size = vec_size_of(vec_16_byte_t {}); + for(; i <= (input_width - vec_size); i += vec_size) { - auto vec_in = vld>(tmp_ptr + i); - vec_16_byte_t normalized_value = vmul_n(vec_in, sum_inversed); + auto vec_in = vld>(tmp_ptr + i); + vec_16_byte_t normalized_value{}; + if(is_log) + { + normalized_value = vsub(vec_in, vdup_n>(sum)); + } + else + { + normalized_value = vmul_n(vec_in, sum_inversed); + } vst(out_ptr + i, normalized_value); } } /* Run remaining elements */ for(; i < input_width; ++i) { - out_ptr[i] = tmp_ptr[i] * sum_inversed; + if(is_log) + { + out_ptr[i] = tmp_ptr[i] - sum; + } + else + { + out_ptr[i] = tmp_ptr[i] * sum_inversed; + } } } }, @@ -753,12 +848,14 @@ void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const } } // namespace -NELogits1DSoftmaxKernel::NELogits1DSoftmaxKernel() +template +NELogits1DSoftmaxKernel::NELogits1DSoftmaxKernel() : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _beta(1.0f), _tmp(nullptr) { } -void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp) +template +void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp); ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), max->info(), output->info(), tmp->info()); @@ -771,15 +868,15 @@ void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max switch(input->info()->data_type()) { case DataType::QASYMM8: - _func = &logits_1d_softmax_qasymm8; + _func = &logits_1d_softmax_qasymm8; break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &logits_1d_softmax_float; + _func = &logits_1d_softmax_float; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: - _func = &logits_1d_softmax_float; + _func = &logits_1d_softmax_float; break; default: ARM_COMPUTE_ERROR("Unsupported data type."); @@ -795,8 +892,9 @@ void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max INEKernel::configure(win_config.second); } -Status NELogits1DSoftmaxKernel::validate(const ITensorInfo *input, const ITensorInfo *max, - const ITensorInfo *output, const 
float beta, const ITensorInfo *tmp) +template +Status NELogits1DSoftmaxKernel::validate(const ITensorInfo *input, const ITensorInfo *max, + const ITensorInfo *output, const float beta, const ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, output, tmp); @@ -806,7 +904,8 @@ Status NELogits1DSoftmaxKernel::validate(const ITensorInfo *input, const ITensor return Status{}; } -void NELogits1DSoftmaxKernel::run(const Window &window, const ThreadInfo &info) +template +void NELogits1DSoftmaxKernel::run(const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -822,4 +921,7 @@ void NELogits1DSoftmaxKernel::run(const Window &window, const ThreadInfo &info) (*_func)(*_input, *_max, tmp_for_thread, *_output, _beta, window); } +template class NELogits1DSoftmaxKernel; +template class NELogits1DSoftmaxKernel; + } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 79a94961d8..f530a87d05 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -33,13 +33,15 @@ namespace arm_compute { -NESoftmaxLayer::NESoftmaxLayer(std::shared_ptr memory_manager) +template +NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(std::shared_ptr memory_manager) : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(), _output_flattened(), _needs_flattening(false) { } -void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis) +template +void NESoftmaxLayerGeneric::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, size_t axis) { // Flatten the input const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis); @@ -68,11 +70,12 @@ void NESoftmaxLayer::configure_reshape_input_kernel(const ITensor *input, const auto_init_if_empty(*output->info(), *input->info()->clone()); } -void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis) +template +void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, float beta, size_t axis) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(input->info(), output->info(), beta, axis)); + ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis)); // We don't need flattening only in the case the input is 2D and axis is 1 _needs_flattening = axis != 1; @@ -138,7 +141,8 @@ void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size _tmp.allocator()->allocate(); } -Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis) +template +Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -173,12 +177,13 @@ Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *out } ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care)); + 
ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DSoftmaxKernel::validate(&tensor_info_tmp, &tensor_info_max_sum, output, beta, &dont_care)); return Status{}; } -void NESoftmaxLayer::run() +template +void NESoftmaxLayerGeneric::run() { MemoryGroupResourceScope scope_mg(_memory_group); @@ -196,4 +201,8 @@ void NESoftmaxLayer::run() NEScheduler::get().schedule(&_reshape_kernel, Window::DimY); } } + +template class NESoftmaxLayerGeneric; +template class NESoftmaxLayerGeneric; + } // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/NEON/LogSoftmaxLayer.cpp b/tests/validation/NEON/LogSoftmaxLayer.cpp new file mode 100644 index 0000000000..e35c8fd8a2 --- /dev/null +++ b/tests/validation/NEON/LogSoftmaxLayer.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" +#include "tests/NEON/Accessor.h" +#include "tests/PaddingCalculator.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/SoftmaxLayerFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +/** Tolerance for float operations */ +constexpr RelativeTolerance tolerance_f32(0.00001f); +RelativeTolerance tolerance_f16(half(0.2)); + +/** Tolerance for quantized operations */ +constexpr AbsoluteTolerance tolerance_qasymm8(1); + +/** CNN data types */ +const auto CNNDataTypes = framework::dataset::make("DataType", +{ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + DataType::F16, +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + DataType::F32, +}); +} // namespace + +TEST_SUITE(NEON) +TEST_SUITE(LogSoftmaxLayer) + +template +using NELogSoftmaxLayerFixture = SoftmaxValidationFixture; + +TEST_SUITE(Float) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("Beta", { 1.0f, 2.0f })), + framework::dataset::make("Axis", { 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f16); +} +FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("Beta", { 1.0f, 2.0f })), + framework::dataset::make("Axis", { 1, 2, 3 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("Beta", { 1.0f, 2.0f })), + framework::dataset::make("Axis", { 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f16); +} +TEST_SUITE_END() //FP16 +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("Beta", { 1.0f, 2.0f })), + framework::dataset::make("Axis", { 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("Beta", { 1.0f, 2.0f })), + framework::dataset::make("Axis", { 1, 2, 3 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), + framework::dataset::make("DataType", 
DataType::F32)), + framework::dataset::make("Beta", { 1.0f, 2.0f })), + framework::dataset::make("Axis", { 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() //FP32 +TEST_SUITE_END() //Float + +template +using NELogSoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture; + +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + framework::dataset::make("Beta", { 1.0f, 2.f }))), + framework::dataset::make("Axis", { 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::Small4DShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + framework::dataset::make("Beta", { 1.0f, 2.f }))), + framework::dataset::make("Axis", { 1, 2, 3 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(), + framework::dataset::make("DataType", DataType::QASYMM8)), + combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + framework::dataset::make("Beta", { 1.0f, 2.0f }))), + framework::dataset::make("Axis", { 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() //QASYMM8 +TEST_SUITE_END() //Quantized + +TEST_SUITE_END() //LogSoftmaxLayer +TEST_SUITE_END() //NEON +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h index e39ee74800..f747ab3574 100644 --- a/tests/validation/fixtures/SoftmaxLayerFixture.h +++ b/tests/validation/fixtures/SoftmaxLayerFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,6 +32,7 @@ #include "tests/IAccessor.h" #include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" +#include "tests/validation/reference/LogSoftmaxLayer.h" #include "tests/validation/reference/SoftmaxLayer.h" #include @@ -42,7 +43,7 @@ namespace test { namespace validation { -template +template class SoftmaxValidationGenericFixture : public framework::Fixture { public: @@ -110,7 +111,14 @@ protected: // Fill reference fill(src); - return reference::softmax_layer(src, beta, axis); + if(IS_LOG) + { + return reference::log_softmax_layer(src, beta, axis); + } + else + { + return reference::softmax_layer(src, beta, axis); + } } TensorType _target{}; @@ -118,33 +126,33 @@ protected: QuantizationInfo _quantization_info{}; }; -template -class SoftmaxValidationFixture : public SoftmaxValidationGenericFixture +template +class SoftmaxValidationFixture : public SoftmaxValidationGenericFixture { public: template void setup(TensorShape shape, DataType data_type, float beta, size_t axis) { - SoftmaxValidationGenericFixture::setup(shape, - data_type, - QuantizationInfo(), - beta, - axis); + SoftmaxValidationGenericFixture::setup(shape, + data_type, + QuantizationInfo(), + beta, + axis); } }; -template -class SoftmaxValidationQuantizedFixture : public SoftmaxValidationGenericFixture +template +class SoftmaxValidationQuantizedFixture : public SoftmaxValidationGenericFixture { public: template void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, float beta, size_t axis) { - SoftmaxValidationGenericFixture::setup(shape, - data_type, - quantization_info, - beta, - axis); + SoftmaxValidationGenericFixture::setup(shape, + data_type, + quantization_info, + beta, + axis); } }; } // namespace validation diff --git a/tests/validation/reference/LogSoftmaxLayer.cpp b/tests/validation/reference/LogSoftmaxLayer.cpp new file mode 100644 index 0000000000..3f21d85dd0 --- /dev/null +++ b/tests/validation/reference/LogSoftmaxLayer.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "LogSoftmaxLayer.h" +#include "SoftmaxLayer.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace reference +{ +template ::value, int>::type> +SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, size_t axis) +{ + return softmax_layer_generic(src, beta, axis, true); +} + +template ::value, int>::type> +SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, size_t axis) +{ + // Note: Output quantization info should always have scale = 1/256 and offset = 0 + const QuantizationInfo output_quantization_info = QuantizationInfo(1.f / 256, 0); + + SimpleTensor src_tmp = convert_from_asymmetric(src); + SimpleTensor dst_tmp = log_softmax_layer(src_tmp, beta, axis); + SimpleTensor dst = convert_to_asymmetric(dst_tmp, output_quantization_info); + return dst; +} + +template SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, size_t axis); +template SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, size_t axis); +template SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, size_t axis); +} // namespace reference +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/reference/LogSoftmaxLayer.h b/tests/validation/reference/LogSoftmaxLayer.h new file mode 100644 index 0000000000..35547cabad --- /dev/null +++ b/tests/validation/reference/LogSoftmaxLayer.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H__ +#define __ARM_COMPUTE_TEST_LOG_SOFTMAX_LAYER_H__ + +#include "tests/SimpleTensor.h" +#include "tests/validation/Helpers.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace reference +{ +template ::value, int>::type = 0> +SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, size_t axis = 1); + +template ::value, int>::type = 0> +SimpleTensor log_softmax_layer(const SimpleTensor &src, float beta, size_t axis = 1); +} // namespace reference +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TEST_SOFTMAX_LAYER_H__ */ diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp index fabc62bedb..ef2468df59 100644 --- a/tests/validation/reference/SoftmaxLayer.cpp +++ b/tests/validation/reference/SoftmaxLayer.cpp @@ -34,7 +34,7 @@ namespace validation namespace reference { template ::value, int>::type> -SimpleTensor softmax_layer(const SimpleTensor &src, float beta, size_t axis) +SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, size_t axis, bool is_log) { // Create reference SimpleTensor dst{ src.shape(), src.data_type(), 1 }; @@ -65,23 +65,48 @@ SimpleTensor softmax_layer(const SimpleTensor &src, float beta, size_t axi // Regularize T sum(0.f); - std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta](T val) + std::transform(src_row_ptr, src_row_ptr + lower_dims, dst_row_ptr, [&sum, max, beta, is_log](T val) { - const T res(std::exp((val - max) * beta)); - sum += res; + T res{ (val - max) *beta }; + + if(is_log) + { + sum += std::exp(res); + } + else + { + res = std::exp(res); + sum += res; + } return res; }); // Normalize - std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum](T val) + std::transform(dst_row_ptr, dst_row_ptr + lower_dims, dst_row_ptr, [sum, is_log](T val) { - return val / sum; + if(is_log) + { + return val - sum; + } + else + { + return val / sum; + } }); } return dst; } +template SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, size_t axis, bool is_log); +template SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, size_t axis, bool is_log); + +template ::value, int>::type> +SimpleTensor softmax_layer(const SimpleTensor &src, float beta, size_t axis) +{ + return softmax_layer_generic(src, beta, axis, false); +} + template ::value, int>::type> SimpleTensor softmax_layer(const SimpleTensor &src, float beta, size_t axis) { diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h index d21ca2bf20..fa9485ce31 100644 --- a/tests/validation/reference/SoftmaxLayer.h +++ b/tests/validation/reference/SoftmaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 ARM Limited. + * Copyright (c) 2017-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -35,6 +35,9 @@ namespace validation { namespace reference { +template ::value, int>::type = 0> +SimpleTensor softmax_layer_generic(const SimpleTensor &src, float beta, size_t axis, bool is_log = false); + template ::value, int>::type = 0> SimpleTensor softmax_layer(const SimpleTensor &src, float beta, size_t axis = 1); -- cgit v1.2.1
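
For reference, the numerically stable identities behind the two variants implemented here are the following; note that the standard log-softmax subtracts the logarithm of the reduced sum (the Doxygen comment in NESoftmaxLayer.h writes the sum without the log):

@f[ softmax(x)_i = \frac{e^{\beta (x_i - \max_j x_j)}}{\sum_k e^{\beta (x_k - \max_j x_j)}} @f]
@f[ logsoftmax(x)_i = \beta (x_i - \max_j x_j) - \log\left(\sum_k e^{\beta (x_k - \max_j x_j)}\right) @f]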
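
A minimal scalar sketch of the computation that the templated kernels above vectorize, with a runtime is_log flag standing in for the IS_LOG template parameter. This is an illustration only, not the patch's NEON code: the function name and signature are made up, and it applies std::log to the reduced sum as in the standard definition.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// One 1D softmax / log-softmax pass over a single (non-empty) row of logits.
std::vector<float> softmax_1d(const std::vector<float> &row, float beta, bool is_log)
{
    std::vector<float> out(row.size());
    const float max_val = *std::max_element(row.begin(), row.end());

    // Shift by the row maximum (and scale by beta) for numerical stability,
    // accumulating the sum of exponentials as we go.
    float sum = 0.f;
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        const float shifted = (row[i] - max_val) * beta;
        out[i] = is_log ? shifted : std::exp(shifted);
        sum += std::exp(shifted);
    }

    // Normalize: divide by the sum, or subtract its logarithm in the log case.
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        out[i] = is_log ? out[i] - std::log(sum) : out[i] / sum;
    }
    return out;
}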
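
The QASYMM8 path normalizes with sum_inversed = 256.f / sum and saturate-casts on store, because a quantized softmax output always uses scale = 1/256 and offset = 0 (as the new reference file also notes). Below is a scalar sketch of that normalization for the non-log case; it assumes the caller passes the input tensor's quantization scale explicitly, whereas the kernel folds it into scale_beta.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Quantized (QASYMM8) softmax over one row: exponentials are computed in
// float, then rescaled so that probability 1.0 maps to 256 and clamped to
// the uint8_t output range.
std::vector<uint8_t> softmax_qasymm8_1d(const std::vector<uint8_t> &row, float in_scale, float beta)
{
    const int max_val = *std::max_element(row.begin(), row.end());

    std::vector<float> tmp(row.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        // (x - max) in quantized units, mapped back to real values via the input scale.
        tmp[i] = std::exp(static_cast<float>(static_cast<int>(row[i]) - max_val) * in_scale * beta);
        sum += tmp[i];
    }

    // Fixed output quantization (scale 1/256, offset 0): store p * 256, saturated.
    std::vector<uint8_t> out(row.size());
    const float sum_inversed = 256.f / sum;
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        out[i] = static_cast<uint8_t>(std::min(tmp[i] * sum_inversed, 255.f));
    }
    return out;
}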
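
Finally, a sketch of how the new NELogSoftmaxLayer alias would be driven through the usual runtime flow (init, configure, allocate, run). The shape, beta and axis values are arbitrary; the surrounding setup follows the existing NESoftmaxLayer interface shown in this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_log_softmax_example()
{
    Tensor src, dst;

    // 2D input: 128 logits per row, 32 rows (values are arbitrary).
    src.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));

    // NELogSoftmaxLayer is NESoftmaxLayerGeneric instantiated for the log variant.
    NELogSoftmaxLayer log_softmax;
    log_softmax.configure(&src, &dst, /* beta */ 1.0f, /* axis */ 1);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src with logits ...

    log_softmax.run();
}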