From 5a5945387e70f62e6e1e95a177fae261d7570443 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Mon, 3 Dec 2018 14:30:05 +0000
Subject: COMPMID-1809: Remove padding in NEGEMMConvolutionLayer 64-bit path.

Change-Id: I1806591a2c73a1f057f13d8c6107d7b9796a82c8
Reviewed-on: https://review.mlplatform.org/370
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
---
 src/core/NEON/kernels/NEActivationLayerKernel.cpp | 551 ++++++++--------
 1 file changed, 192 insertions(+), 359 deletions(-)

(limited to 'src/core/NEON/kernels/NEActivationLayerKernel.cpp')

diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 5ce79f1007..97cb9ceb2e 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/QAsymm8.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -60,29 +61,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    Window                 win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    bool                   window_changed = false;
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
 
-    if(output != nullptr && (output->total_size() != 0))
+    if(output != nullptr)
     {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-        window_changed = update_window_and_padding(win,
-                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration),
-                                                   output_access);
+        // Output auto inizialitation if not yet initialized
+        auto_init_if_empty(*output, *input->clone());
 
-        output_access.set_valid_region(win, input->valid_region());
-    }
-    else
-    {
-        // In-place computation
-        window_changed = update_window_and_padding(win,
-                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+        // NEActivationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } // namespace
@@ -101,15 +94,13 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
 
     if(output != nullptr)
     {
-        // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output->info(), *input->info()->clone());
         _output = output;
     }
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ?
output->info() : nullptr)); - ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU), + ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU) + && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU), "For QASYMM8 only relu and lower/upper bounded relu are supported"); // Activation functions : FP32 @@ -176,337 +167,129 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat ICPPKernel::configure(win_config.second); } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) +typename std::enable_if::value, void>::type +NEActivationLayerKernel::activation(const Window &window) { - Iterator input(_input, window); - Iterator output(_output, window); + /** NEON vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationFunction act = F; - static const float16x8_t CONST_0 = vdupq_n_f16(0.f); - static const float16x8_t CONST_1_H = vdupq_n_f16(1.f); + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f); + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); - const float16x8_t a = vdupq_n_f16(_act_info.a()); - const float16x4_t a_h = vdup_n_f16(_act_info.a()); - const float16x8_t b = vdupq_n_f16(_act_info.b()); + const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + const auto va = wrapper::vdup_n(static_cast(_act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast(_act_info.b()), ExactTagType{}); + const auto a = static_cast(_act_info.a()); + const auto b = static_cast(_act_info.b()); - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win_collapsed, [&](const Coordinates & id) { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - const float16x8x2_t in = vld2q_f16(input_ptr); - float16x8x2_t tmp = { {} }; + wrapper::traits::neon_bitvector_t tmp; - switch(F) + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationFunction::ABS: - tmp = - { - { - vabsq_f16(in.val[0]), - vabsq_f16(in.val[1]), - } - }; - break; - case ActivationFunction::BOUNDED_RELU: - tmp = - { - { - vminq_f16(a, vmaxq_f16(CONST_0, in.val[0])), - vminq_f16(a, vmaxq_f16(CONST_0, in.val[1])) - } - }; - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = - { - { - vminq_f16(a, vmaxq_f16(b, in.val[0])), - vminq_f16(a, vmaxq_f16(b, in.val[1])) - } - }; - break; - case 
ActivationFunction::LINEAR: - tmp = - { - { - vaddq_f16(b, vmulq_f16(a, in.val[0])), - vaddq_f16(b, vmulq_f16(a, in.val[1])) - } - }; - break; - case ActivationFunction::LOGISTIC: - { - tmp = - { - { - vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))), - vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1])))) - } - }; - } - break; - case ActivationFunction::RELU: - tmp = - { - { - vmaxq_f16(CONST_0, in.val[0]), - vmaxq_f16(CONST_0, in.val[1]) - } - }; - break; - case ActivationFunction::LEAKY_RELU: - tmp = - { - { - vbslq_f16(vcgtq_f16(in.val[0], CONST_0), in.val[0], vmulq_f16(a, in.val[0])), - vbslq_f16(vcgtq_f16(in.val[1], CONST_0), in.val[1], vmulq_f16(a, in.val[1])) - } - }; - break; - case ActivationFunction::SOFT_RELU: - { - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float16x4x2_t in0 = - { - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[0])))))), - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[0])))))), - }; - - const float16x4x2_t in1 = - { - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[1])))))), - vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_high_f16(in.val[1])))))), - }; - - tmp = - { - { - vcombine_f16(in0.val[0], in0.val[1]), - vcombine_f16(in1.val[0], in1.val[1]), - } - }; - } - break; - case ActivationFunction::SQRT: - tmp = - { - { - vinvq_f16(vinvsqrtq_f16(in.val[0])), - vinvq_f16(vinvsqrtq_f16(in.val[1])), - } - }; - break; - case ActivationFunction::SQUARE: - tmp = - { - { - vmulq_f16(in.val[0], in.val[0]), - vmulq_f16(in.val[1], in.val[1]) - } - }; - break; - case ActivationFunction::TANH: + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) { - // TODO (COMPMID-1535) : Revisit FP16 approximations - const float16x8x2_t mul = - { - vmulq_f16(b, in.val[0]), - vmulq_f16(b, in.val[1]) - }; - const float16x4x2_t in0 = - { - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[0]))))), - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[0]))))), - }; - - const float16x4x2_t in1 = - { - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_low_f16(mul.val[1]))))), - vmul_f16(a_h, vcvt_f16_f32(vtanhq_f32(vcvt_f32_f16(vget_high_f16(mul.val[1]))))), - }; - - tmp = - { - { - vcombine_f16(in0.val[0], in0.val[1]), - vcombine_f16(in1.val[0], in1.val[1]), - } - }; + case ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationFunction::SOFT_RELU: + tmp = wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))); + break; + case ActivationFunction::SQRT: + tmp = wrapper::vinv(wrapper::vinvsqrt(vin)); + break; + case ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + 
default: + ARM_COMPUTE_ERROR("Unsupported activation function"); } - break; - default: - ARM_COMPUTE_ERROR("Not implemented"); - break; + wrapper::vstore(output_ptr + x, tmp); } - vst2q_f16(output_ptr, tmp); - }, - input, output); -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) -{ - Iterator input(_input, window); - Iterator output(_output, window); - - static const float32x4_t CONST_1 = vdupq_n_f32(1.f); - static const float32x4_t CONST_0 = vdupq_n_f32(0.f); - const float32x4_t a = vdupq_n_f32(_act_info.a()); - const float32x4_t b = vdupq_n_f32(_act_info.b()); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const float32x4x4_t in = + // Compute left-over elements + for(; x < window_end_x; ++x) { + const T in = *(reinterpret_cast(input_ptr + x)); + T tmp; + switch(act) { - vld1q_f32(input_ptr), - vld1q_f32(input_ptr + 4), - vld1q_f32(input_ptr + 8), - vld1q_f32(input_ptr + 12) + case ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationFunction::LOGISTIC: + tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); + break; + case ActivationFunction::RELU: + tmp = std::max(static_cast(0), in); + break; + case ActivationFunction::BOUNDED_RELU: + tmp = std::min(a, std::max(static_cast(0), in)); + break; + case ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min(a, std::max(b, in)); + break; + case ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationFunction::SOFT_RELU: + tmp = std::log(static_cast(1) + std::exp(in)); + break; + case ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); } - }; - float32x4x4_t tmp = { {} }; - - switch(F) - { - case ActivationFunction::ABS: - tmp = - { - { - vabsq_f32(in.val[0]), - vabsq_f32(in.val[1]), - vabsq_f32(in.val[2]), - vabsq_f32(in.val[3]), - } - }; - break; - case ActivationFunction::LINEAR: - tmp = - { - { - vmlaq_f32(b, a, in.val[0]), - vmlaq_f32(b, a, in.val[1]), - vmlaq_f32(b, a, in.val[2]), - vmlaq_f32(b, a, in.val[3]), - } - }; - break; - case ActivationFunction::LOGISTIC: - tmp = - { - { - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))), - vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))), - } - }; - break; - case ActivationFunction::RELU: - tmp = - { - { - vmaxq_f32(CONST_0, in.val[0]), - vmaxq_f32(CONST_0, in.val[1]), - vmaxq_f32(CONST_0, in.val[2]), - vmaxq_f32(CONST_0, in.val[3]), - } - }; - break; - case ActivationFunction::BOUNDED_RELU: - tmp = - { - { - vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])), - vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])), - } - }; - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = - { - { - vminq_f32(a, vmaxq_f32(b, in.val[0])), - vminq_f32(a, vmaxq_f32(b, in.val[1])), - vminq_f32(a, vmaxq_f32(b, in.val[2])), - vminq_f32(a, vmaxq_f32(b, in.val[3])), - } - }; - break; - case 
ActivationFunction::LEAKY_RELU: - tmp = - { - { - vbslq_f32(vcgtq_f32(in.val[0], CONST_0), in.val[0], vmulq_f32(a, in.val[0])), - vbslq_f32(vcgtq_f32(in.val[1], CONST_0), in.val[1], vmulq_f32(a, in.val[1])), - vbslq_f32(vcgtq_f32(in.val[2], CONST_0), in.val[2], vmulq_f32(a, in.val[2])), - vbslq_f32(vcgtq_f32(in.val[3], CONST_0), in.val[3], vmulq_f32(a, in.val[3])), - } - }; - break; - case ActivationFunction::SOFT_RELU: - tmp = - { - { - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))), - vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))), - } - }; - break; - case ActivationFunction::SQRT: - tmp = - { - { - vinvq_f32(vinvsqrtq_f32(in.val[0])), - vinvq_f32(vinvsqrtq_f32(in.val[1])), - vinvq_f32(vinvsqrtq_f32(in.val[2])), - vinvq_f32(vinvsqrtq_f32(in.val[3])), - } - }; - break; - case ActivationFunction::SQUARE: - tmp = - { - { - vmulq_f32(in.val[0], in.val[0]), - vmulq_f32(in.val[1], in.val[1]), - vmulq_f32(in.val[2], in.val[2]), - vmulq_f32(in.val[3], in.val[3]), - } - }; - break; - case ActivationFunction::TANH: - tmp = - { - { - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))), - vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))), - } - }; - break; - default: - break; + *(output_ptr + x) = tmp; } - - vst1q_f32(output_ptr, tmp.val[0]); - vst1q_f32(output_ptr + 4, tmp.val[1]); - vst1q_f32(output_ptr + 8, tmp.val[2]); - vst1q_f32(output_ptr + 12, tmp.val[3]); }, input, output); } @@ -514,13 +297,25 @@ typename std::enable_if::value, void>::type NEActivationL template typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) { - Iterator input(_input, window); - Iterator output(_output, window); - const QuantizationInfo qi_in = _input->info()->quantization_info(); - const QuantizationInfo qi_out = _output->info()->quantization_info(); - const qasymm8x16_t a = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset)); - const qasymm8x16_t b = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset)); - const qasymm8x16_t CONST_0 = vdupq_n_u8(sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset)); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationFunction act = F; + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win_collapsed); + Iterator output(_output, win_collapsed); + + const QuantizationInfo qi_in = _input->info()->quantization_info(); + const QuantizationInfo qi_out = _output->info()->quantization_info(); + const qasymm8x16_t va = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset)); + const qasymm8x16_t vb = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset)); + const qasymm8_t a = sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset); + const qasymm8_t b = sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset); + const qasymm8_t const_0 = sqcvt_qasymm8_f32(0.f, qi_in.scale, qi_in.offset); + const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); // Initialise scale/offset for re-quantization float s = qi_in.scale / qi_out.scale; @@ -528,34 +323,72 @@ typename std::enable_if::value, void>::type NEActivat 
     float32x4_t vs = vdupq_n_f32(s);
     float32x4_t vo = vdupq_n_f32(o);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    execute_window_loop(win_collapsed, [&](const Coordinates & id)
     {
-        const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        const qasymm8x16_t in  = vld1q_u8(input_ptr);
-        qasymm8x16_t       tmp = {};
+        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
 
-        switch(F)
+        // Compute S elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            case ActivationFunction::LU_BOUNDED_RELU:
+            const auto vin = wrapper::vloadq(input_ptr + x);
+            if(act == ActivationFunction::RELU)
+            {
                 // Perform activation
-                tmp = vminq_u8(a, vmaxq_u8(b, in));
+                tmp = vmaxq_u8(vconst_0, vin);
                 // Re-quantize to new output space
                 tmp = vmlaq_qasymm8(tmp, vs, vo);
-                break;
-            case ActivationFunction::RELU:
+            }
+            else if(act == ActivationFunction::BOUNDED_RELU)
+            {
                 // Perform activation
-                tmp = vmaxq_u8(CONST_0, in);
+                tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
                 // Re-quantize to new output space
                 tmp = vmlaq_qasymm8(tmp, vs, vo);
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Function not implemented");
-                break;
+            }
+            else if(act == ActivationFunction::LU_BOUNDED_RELU)
+            {
+                // Perform activation
+                tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            wrapper::vstore(output_ptr + x, tmp);
         }
-        vst1q_u8(output_ptr, tmp);
 
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            T in  = *(reinterpret_cast<const T *>(input_ptr + x));
+            T tmp;
+            if(act == ActivationFunction::RELU)
+            {
+                tmp = std::max(const_0, in);
+                tmp = std::max(0, std::min(static_cast<int32_t>(tmp * s + o), 255));
+            }
+            else if(act == ActivationFunction::BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(const_0, in));
+                tmp = std::max(0, std::min(static_cast<int32_t>(tmp * s + o), 255));
+            }
+            else if(act == ActivationFunction::LU_BOUNDED_RELU)
+            {
+                tmp = std::min(a, std::max(b, in));
+                tmp = std::max(0, std::min(static_cast<int32_t>(tmp * s + o), 255));
+            }
+            else
+            {
+                ARM_COMPUTE_ERROR("Unsupported activation function");
+            }
+            *(output_ptr + x) = tmp;
+        }
     },
     input, output);
 }
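
The heart of this change is the new loop structure of the activation() specialisations: instead of forcing 16 bytes of padding per iteration, each row is processed with a vector body of 16 / sizeof(T) elements followed by a scalar left-over loop, so the kernel never reads or writes past the tensor edge. Below is a minimal standalone sketch of that pattern for a float ReLU, assuming plain NEON intrinsics rather than the arm_compute wrapper layer; the function name and signature are illustrative only.

    // Sketch of the vector-body + scalar-tail split used by the new kernel
    // (plain NEON intrinsics; compiles for an AArch32/AArch64 target).
    #include <arm_neon.h>
    #include <algorithm>
    #include <cstddef>

    void relu_f32(const float *in, float *out, size_t len)
    {
        const size_t      step  = 4; // a 128-bit register holds 4 floats, i.e. 16 / sizeof(float)
        const float32x4_t vzero = vdupq_n_f32(0.f);

        size_t x = 0;
        // Vector body: full 4-element chunks, no padding needed past the end of the row
        for(; x + step <= len; x += step)
        {
            vst1q_f32(out + x, vmaxq_f32(vzero, vld1q_f32(in + x)));
        }
        // Scalar tail: the 0..3 left-over elements
        for(; x < len; ++x)
        {
            out[x] = std::max(0.f, in[x]);
        }
    }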
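
Around that inner loop, the window is first collapsed (window.collapse_if_possible(window, Window::DimZ)) and its X dimension is reduced to a single step, so execute_window_loop() only iterates the outer dimensions while the kernel body advances X itself. A rough standalone sketch of that division of labour over a dense 3-D buffer follows; the dimensions, layout and function names are hypothetical and stand in for the library's Window/Iterator machinery.

    // Outer loop walks every row (Y and Z collapsed into one counter); the row routine
    // from the previous sketch handles the X axis, vector body first, scalar tail last.
    #include <cstddef>
    #include <vector>

    void relu_f32(const float *in, float *out, size_t len); // defined in the sketch above

    void run_over_tensor(const std::vector<float> &src, std::vector<float> &dst,
                         size_t width, size_t height, size_t depth)
    {
        for(size_t row = 0; row < height * depth; ++row) // collapsed Y/Z iteration
        {
            const size_t offset = row * width; // assumes a packed layout with no row padding
            relu_f32(src.data() + offset, dst.data() + offset, width);
        }
    }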
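
For QASYMM8 the activation is evaluated directly on quantized values and re-quantized into the output space using the scale ratio s = qi_in.scale / qi_out.scale and an offset o that folds in the two zero points; the vector body does this with vmlaq_qasymm8(), the left-over loop with a scalar multiply-add clamped to [0, 255]. The sketch below restates that arithmetic as a standalone function; the explicit formula for o and the truncating conversion are assumptions made for illustration, not a statement of the library's API.

    // Quantized ReLU with re-quantization, written out in scalar form.
    #include <algorithm>
    #include <cstdint>

    uint8_t qasymm8_relu_requantize(uint8_t in_q,
                                    float scale_in, int offset_in,
                                    float scale_out, int offset_out)
    {
        // real value = (in_q - offset_in) * scale_in, so mapping it into the output space
        // collapses to a single multiply-add in the quantized domain:
        //   out_q = in_q * s + o,  with s = scale_in / scale_out and o = offset_out - offset_in * s
        const float s = scale_in / scale_out;
        const float o = offset_out - offset_in * s;

        // ReLU against the quantized representation of 0.f, which is the input offset
        const int relu_q = std::max<int>(offset_in, in_q);

        // Re-quantize and clamp to the uint8 range
        const int out_q = static_cast<int>(relu_q * s + o);
        return static_cast<uint8_t>(std::max(0, std::min(out_q, 255)));
    }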