diff options
Diffstat (limited to 'src/cpu/kernels/addmuladd')
-rw-r--r-- | src/cpu/kernels/addmuladd/generic/neon/fp16.cpp | 106 | ||||
-rw-r--r-- | src/cpu/kernels/addmuladd/generic/neon/fp32.cpp | 104 | ||||
-rw-r--r-- | src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp | 137 | ||||
-rw-r--r-- | src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp | 137 | ||||
-rw-r--r-- | src/cpu/kernels/addmuladd/list.h | 5 |
5 files changed, 266 insertions, 223 deletions
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp index d8e5f694a8..b4b81aa78b 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/CpuTypes.h" #include <cstddef> @@ -38,16 +39,20 @@ namespace { using arm_compute::float16_t; -void a64_add_bn_clamp_direct_fp16_2x32( - float16_t *out, size_t out_stride, - float16_t *out_direct, size_t out_direct_stride, - const float16_t *in0, size_t in0_stride, - const float16_t *in1, size_t in1_stride, - const float16_t *bn_mul, - const float16_t *bn_add, - const float16_t minval, - const float16_t maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out, + size_t out_stride, + float16_t *out_direct, + size_t out_direct_stride, + const float16_t *in0, + size_t in0_stride, + const float16_t *in1, + size_t in1_stride, + const float16_t *bn_mul, + const float16_t *bn_add, + const float16_t minval, + const float16_t maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -858,9 +863,14 @@ void a64_add_bn_clamp_direct_fp16_2x32( "subs x20, x20, #0x2\n" "bgt 8b\n" "58:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -869,8 +879,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp16_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -882,16 +899,16 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I float16_t minval = std::numeric_limits<half>::lowest(); float16_t maxval = std::numeric_limits<half>::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = static_cast<float16_t>(0.f); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = static_cast<float16_t>(0.f); maxval = static_cast<float16_t>(act_info.a()); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = static_cast<float16_t>(act_info.b()); maxval = static_cast<float16_t>(act_info.a()); @@ -909,42 +926,37 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, - reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float16_t *>(bn_mul->buffer()), - reinterpret_cast<float16_t *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, + reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride, + reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride, + reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, + reinterpret_cast<float16_t *>(bn_mul->buffer()), + reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float16_t *>(bn_mul->buffer()), - reinterpret_cast<float16_t *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, nullptr, + out_direct_stride, reinterpret_cast<float16_t *>(in1_it.ptr()), + in0_stride, reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, + reinterpret_cast<float16_t *>(bn_mul->buffer()), + reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp index b0c487ec56..f0444b6acd 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp @@ -35,16 +35,20 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_fp32_2x16( - float *out, size_t out_stride, - float *out_direct, size_t out_direct_stride, - const float *in0, size_t in0_stride, - const float *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const float minval, - const float maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp32_2x16(float *out, + size_t out_stride, + float *out_direct, + size_t out_direct_stride, + const float *in0, + size_t in0_stride, + const float *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const float minval, + const float maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -631,18 +635,30 @@ void a64_add_bn_clamp_direct_fp32_2x16( "subs x20, x20, #0x2\n" "bgt 8b\n" "34:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); -} + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } +} // namespace namespace arm_compute { namespace cpu { -void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp32_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -654,16 +670,16 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I float minval = std::numeric_limits<float>::lowest(); float maxval = std::numeric_limits<float>::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = 0.f; } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = 0.f; maxval = act_info.a(); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = act_info.b(); maxval = act_info.a(); @@ -681,42 +697,34 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast<float *>(out_it.ptr()), out_stride, - reinterpret_cast<float *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<float *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float *>(bn_mul->buffer()), - reinterpret_cast<float *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast<float *>(out_it.ptr()), out_stride, reinterpret_cast<float *>(add_out_it.ptr()), + out_direct_stride, reinterpret_cast<float *>(in1_it.ptr()), in0_stride, + reinterpret_cast<float *>(in2_it.ptr()), in1_stride, reinterpret_cast<float *>(bn_mul->buffer()), + reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast<float *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<float *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float *>(bn_mul->buffer()), - reinterpret_cast<float *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast<float *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<float *>(in1_it.ptr()), in0_stride, reinterpret_cast<float *>(in2_it.ptr()), + in1_stride, reinterpret_cast<float *>(bn_mul->buffer()), + reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp index f7448a6717..035805c944 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp @@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_u8_fp32_2x16( - uint8_t *out, size_t out_stride, - uint8_t *out_direct, size_t out_direct_stride, - const uint8_t *in0, size_t in0_stride, - const uint8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const uint8_t minval, - const uint8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out, + size_t out_stride, + uint8_t *out_direct, + size_t out_direct_stride, + const uint8_t *in0, + size_t in0_stride, + const uint8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const uint8_t minval, + const uint8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_u8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_u8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe uint8_t maxval = std::numeric_limits<uint8_t>::max(); const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset; @@ -783,50 +809,35 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, - reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, + reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride, + reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp index 1ae2cb76a9..e1a45b467b 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp @@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_s8_fp32_2x16( - int8_t *out, size_t out_stride, - int8_t *out_direct, size_t out_direct_stride, - const int8_t *in0, size_t in0_stride, - const int8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const int8_t minval, - const int8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out, + size_t out_stride, + int8_t *out_direct, + size_t out_direct_stride, + const int8_t *in0, + size_t in0_stride, + const int8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const int8_t minval, + const int8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_s8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_s8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe int8_t maxval = std::numeric_limits<int8_t>::max(); const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset; @@ -783,50 +809,35 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, - reinterpret_cast<int8_t *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()), + out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, + reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, + out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset, + in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h index a7c22c06d8..568003a916 100644 --- a/src/cpu/kernels/addmuladd/list.h +++ b/src/cpu/kernels/addmuladd/list.h @@ -32,9 +32,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ +#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \ - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) + ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \ + const ActivationLayerInfo &act_info, const Window &window) DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon); DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon); |