about summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/cpu/kernels/CpuElementwiseUnaryKernel.cpp33
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp165
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp42
-rw-r--r--src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp42
-rw-r--r--src/cpu/kernels/elementwise_unary/list.h2
5 files changed, 276 insertions, 8 deletions
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 0adf28af63..4b61ee3a1e 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -51,18 +51,18 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
ARM_COMPUTE_ERROR_ON(!is_data_type_quantized(src->data_type()));
ARM_COMPUTE_ERROR_ON(src->element_size() != 1);
- auto lut = std::unique_ptr<uint8_t[]>(new uint8_t[256]);
+ auto lut = std::unique_ptr<uint8_t[]>(new uint8_t[256]);
const auto is_signed = src->data_type() == DataType::QASYMM8_SIGNED;
- const auto src_qi = src->quantization_info().uniform();
- const auto dst_qi = dst->quantization_info().uniform();
+ const auto src_qi = src->quantization_info().uniform();
+ const auto dst_qi = dst->quantization_info().uniform();
const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale;
const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale;
for(int i = 0; i < 256; ++i)
{
- const auto in = (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
- float result = 0;
+ const auto in = (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
+ float result = 0;
switch(op)
{
@@ -101,7 +101,7 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo
result = utility::clamp(result, dst_min_fp, dst_max_fp);
const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi)) : quantize_qasymm8(result, dst_qi);
- lut[i] = out;
+ lut[i] = out;
}
return lut;
@@ -174,7 +174,7 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai
},
REGISTER_QASYMM8_SVE(sve_q8_elementwise_unary),
&q8_prepare_lut,
- },
+ },
{
"neon_q8_elementwise_unary",
[](const DataTypeISASelectorData & data)
@@ -184,6 +184,25 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai
REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary),
&q8_prepare_lut,
},
+#else // __aarch64__
+ {
+ "neon_qasymm8_signed_elementwise_unary",
+ [](const DataTypeISASelectorData & data)
+ {
+ return data.dt == DataType::QASYMM8_SIGNED;
+ },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_signed_elementwise_unary),
+ nullptr,
+ },
+ {
+ "neon_qasymm8_elementwise_unary",
+ [](const DataTypeISASelectorData & data)
+ {
+ return data.dt == DataType::QASYMM8;
+ },
+ REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary),
+ nullptr,
+ },
#endif // __aarch64__
};
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp
index 30caa4ebeb..2b23e46f40 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,6 +22,8 @@
* SOFTWARE.
*/
#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+#include "arm_compute/core/Helpers.h"
+#include "src/core/NEON/NEAsymm.h"
namespace arm_compute
{
@@ -111,5 +113,166 @@ template void elementwise_op<__fp16>(const ITensor *in, ITensor *out, const Wind
template void elementwise_op<float>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
template void elementwise_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
+template <>
+void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform();
+ const auto min_clamped_value = vdupq_n_f32((-128 - qi_out.offset) * qi_out.scale);
+ const auto max_clamped_value = vdupq_n_f32((127 - qi_out.offset) * qi_out.scale);
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ int8x16_t vout;
+ auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+
+ int x = window_start_x;
+ for(; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ // Perform activation
+ float32x4x4_t vtmp_deq =
+ {
+ {
+ elementwise_op_imp<float>(op, vin_deq.val[0]),
+ elementwise_op_imp<float>(op, vin_deq.val[1]),
+ elementwise_op_imp<float>(op, vin_deq.val[2]),
+ elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }
+ };
+
+ if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+ }
+
+ // Re-quantize to new output space
+ vout = vquantize_signed(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
+ }
+ for(; x < window_end_x; ++x)
+ {
+ qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+ qasymm8_signed_t tmp = 0;
+ float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+ if(tmp_f <= 0.0)
+ {
+ if(op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (-128 - qi_out.offset) * qi_out.scale;
+ }
+ else if(op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (127 - qi_out.offset) * qi_out.scale;
+ }
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
+ tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
+ // For aarch64 LUT is used and rounding to nearest is used
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
+template <>
+void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
+{
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+ const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform();
+ const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform();
+ const auto vconst_0_f32 = vdupq_n_f32(0);
+ const auto min_clamped_value = vdupq_n_f32((0 - qi_out.offset) * qi_out.scale);
+ const auto max_clamped_value = vdupq_n_f32((255 - qi_out.offset) * qi_out.scale);
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(in, win);
+ Iterator output(out, win);
+
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ uint8x16_t vout;
+ auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+ auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ int x = window_start_x;
+ for(; x <= window_end_x - window_step_x; x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+
+ // De-quantize
+ const auto vin_deq = vdequantize(vin, qi_in);
+
+ // Perform activation
+ float32x4x4_t vtmp_deq =
+ {
+ {
+ elementwise_op_imp<float>(op, vin_deq.val[0]),
+ elementwise_op_imp<float>(op, vin_deq.val[1]),
+ elementwise_op_imp<float>(op, vin_deq.val[2]),
+ elementwise_op_imp<float>(op, vin_deq.val[3]),
+ }
+ };
+ if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+ {
+ vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+ vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+ vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+ vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+ }
+
+ // Re-quantize to new output space
+ vout = vquantize(vtmp_deq, qi_out);
+ wrapper::vstore(output_ptr + x, vout);
+ }
+ for(; x < window_end_x; ++x)
+ {
+ qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+ qasymm8_t tmp = 0;
+ float tmp_f = dequantize_qasymm8(in, qi_in);
+ if(tmp_f <= 0.0)
+ {
+ if(op == ElementWiseUnary::LOG)
+ {
+ tmp_f = (0 - qi_out.offset) * qi_out.scale;
+ }
+ else if(op == ElementWiseUnary::RSQRT)
+ {
+ tmp_f = (255 - qi_out.offset) * qi_out.scale;
+ }
+ }
+ else
+ {
+ tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+ }
+ tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
+ *(output_ptr + x) = tmp;
+ }
+ },
+ input, output);
+}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..d987f7747b
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Window.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifndef __aarch64__
+// Fallback function to be used for armv7a, for aarch64 LUT is used
+void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_op<uint8_t>(in, out, window, op);
+}
+#endif // #ifndef __aarch64__
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
new file mode 100644
index 0000000000..e00970a1e0
--- /dev/null
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Window.h"
+#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+#ifndef __aarch64__
+// Fallback function to be used for armv7a, for aarch64 LUT is used
+void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+{
+ ARM_COMPUTE_UNUSED(lut);
+ return elementwise_op<int8_t>(in, out, window, op);
+}
+#endif // #ifndef __aarch64__
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/list.h b/src/cpu/kernels/elementwise_unary/list.h
index 04c3bb6bcb..c1cfbb8a3a 100644
--- a/src/cpu/kernels/elementwise_unary/list.h
+++ b/src/cpu/kernels/elementwise_unary/list.h
@@ -42,6 +42,8 @@ DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp32_elementwise_unary);
DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp16_elementwise_unary);
DECLARE_ELEMETWISE_UNARY_KERNEL(neon_s32_elementwise_unary);
DECLARE_ELEMETWISE_UNARY_KERNEL(neon_q8_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_qasymm8_signed_elementwise_unary);
+DECLARE_ELEMETWISE_UNARY_KERNEL(neon_qasymm8_elementwise_unary);
#undef DECLARE_ELEMETWISE_UNARY_KERNEL