1 files changed, 56 insertions, 4 deletions
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index 54ff858eeb..b7e395fb79 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -48,18 +48,41 @@ namespace kernels
 {
 namespace
 {
+
 /* Softmax */
 static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = {
+    {"sme2_fp32_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     { return (!data.is_log && data.dt == DataType::F32 && data.isa.sme2 && data.axis == 0); },
+     REGISTER_FP32_SME2(sme2_fp32_softmax)},
     {"neon_fp32_softmax",
      [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); },
      REGISTER_FP32_NEON(neon_fp32_softmax<false>)},
+    {"sme2_fp16_softmax",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     { return (!data.is_log && data.dt == DataType::F16 && data.isa.sme2 && data.axis == 0); },
+     REGISTER_FP16_SME2(sme2_fp16_softmax)},
     {"neon_fp16_softmax",
      [](const SoftmaxKernelDataTypeISASelectorData &data)
      { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; },
      REGISTER_FP16_NEON(neon_fp16_softmax<false>)},
+    {"sme2_qu8_softmax_lut_512VL",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     {
+         return (!data.is_log && data.dt == DataType::QASYMM8 && data.isa.sme2 && data.axis == 0 &&
+                 data.sme2_vector_length == 512);
+     },
+     REGISTER_QASYMM8_SME2(sme2_qasymm8_softmax_lut_512VL)},
     {"neon_qu8_softmax",
      [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); },
      REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)},
+    {"sme2_qs8_softmax_lut_512VL",
+     [](const SoftmaxKernelDataTypeISASelectorData &data)
+     {
+         return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED && data.isa.sme2 && data.axis == 0 &&
+                 data.sme2_vector_length == 512);
+     },
+     REGISTER_QASYMM8_SIGNED_SME2(sme2_qasymm8_signed_softmax_lut_512VL)},
     {"neon_qs8_softmax",
      [](const SoftmaxKernelDataTypeISASelectorData &data)
      { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); },
@@ -80,6 +103,28 @@ static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_ker
      REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)},
 };
 
+void init_lut(std::vector<float> &lut, DataType type, float scale, float beta)
+{
+    if (type == DataType::QASYMM8)
+    {
+        for (int i = 0; i < 256; ++i)
+        {
+            lut.push_back(std::exp(-scale * beta * i));
+        }
+    }
+    else if (type == DataType::QASYMM8_SIGNED)
+    {
+        for (int i = -128; i < 128; ++i)
+        {
+            lut.push_back(std::exp(-scale * beta * i));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Invalid datatype for QASYMM8/QASYMM8_SIGNED softmax");
+    }
+}
+
 Status validate_arguments_softmax(
     const ITensorInfo &src, const ITensorInfo &dst, float beta, int axis, const ITensorInfo &tmp, bool is_log)
 {
@@ -149,8 +194,8 @@ void CpuSoftmaxKernel::configure(
         auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(DataType::F32).reset_padding());
     }
 
-    const auto *uk = CpuSoftmaxKernel::get_implementation(
-        SoftmaxKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), is_log});
+    const auto *uk = CpuSoftmaxKernel::get_implementation(SoftmaxKernelDataTypeISASelectorData{
+        src->data_type(), CPUInfo::get().get_isa(), is_log, axis, CPUInfo::get().get_sme2_vector_length()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     std::string kernel_name = is_log ? std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel");
@@ -186,6 +231,13 @@ void CpuSoftmaxKernel::configure(
     win.set(_axis, Window::Dimension(0, 1, 1));
 
     ICpuKernel<CpuSoftmaxKernel>::configure(win);
+
+    const std::string uk_name = uk->name;
+    if (uk_name == "sme2_qu8_softmax_lut_512VL" || uk_name == "sme2_qs8_softmax_lut_512VL")
+    {
+        const float scale = src->quantization_info().uniform().scale;
+        init_lut(_lut, src->data_type(), scale, beta);
+    }
 }
 
 Status CpuSoftmaxKernel::validate(
@@ -222,11 +274,11 @@ void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const
         const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
 
         void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
-        _run_method(src, tmp_for_thread, dst, _beta, _axis, window);
+        _run_method(src, tmp_for_thread, dst, _beta, _axis, window, _lut.data());
     }
     else
     {
-        _run_method(src, nullptr, dst, _beta, _axis, window);
+        _run_method(src, nullptr, dst, _beta, _axis, window, nullptr);
     }
 }