From 0dc0d8eda87a01c11f9caabc0d2f2933737ba469 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 30 Apr 2021 03:18:37 +0100 Subject: Rename PixelwiseMultiplications to Mul for simplicity Changes the names of the following: - PixelWiseMultiplicationKernel to MulKernel for all backends - PixelWiseMultiplication to Mul for all backends Signed-off-by: Georgios Pinitas Change-Id: I88108c2d22c888fce37ea1028863026160b9da97 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5534 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Comments-Addressed: Arm Jenkins --- Android.bp | 8 +- SConscript | 4 +- .../CL/functions/CLPixelWiseMultiplication.h | 4 +- .../NEON/functions/NEPixelWiseMultiplication.h | 4 +- src/core/cpu/kernels/CpuMulKernel.cpp | 1729 ++++++++++++++++++++ src/core/cpu/kernels/CpuMulKernel.h | 150 ++ .../kernels/CpuPixelWiseMultiplicationKernel.cpp | 1729 -------------------- .../cpu/kernels/CpuPixelWiseMultiplicationKernel.h | 175 -- src/core/gpu/cl/kernels/ClMulKernel.cpp | 398 +++++ src/core/gpu/cl/kernels/ClMulKernel.h | 115 ++ .../cl/kernels/ClPixelWiseMultiplicationKernel.cpp | 398 ----- .../cl/kernels/ClPixelWiseMultiplicationKernel.h | 139 -- .../CL/functions/CLPixelWiseMultiplication.cpp | 26 +- .../NEON/functions/NEPixelWiseMultiplication.cpp | 26 +- src/runtime/cpu/operators/CpuMul.cpp | 77 + src/runtime/cpu/operators/CpuMul.h | 109 ++ .../cpu/operators/CpuPixelWiseMultiplication.cpp | 77 - .../cpu/operators/CpuPixelWiseMultiplication.h | 133 -- src/runtime/gpu/cl/operators/ClMul.cpp | 60 + src/runtime/gpu/cl/operators/ClMul.h | 107 ++ .../gpu/cl/operators/ClPixelWiseMultiplication.cpp | 60 - .../gpu/cl/operators/ClPixelWiseMultiplication.h | 132 -- 22 files changed, 2781 insertions(+), 2879 deletions(-) create mode 100644 src/core/cpu/kernels/CpuMulKernel.cpp create mode 100644 src/core/cpu/kernels/CpuMulKernel.h delete mode 100644 src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h create mode 100644 src/core/gpu/cl/kernels/ClMulKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClMulKernel.h delete mode 100644 src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp delete mode 100644 src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h create mode 100644 src/runtime/cpu/operators/CpuMul.cpp create mode 100644 src/runtime/cpu/operators/CpuMul.h delete mode 100644 src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp delete mode 100644 src/runtime/cpu/operators/CpuPixelWiseMultiplication.h create mode 100644 src/runtime/gpu/cl/operators/ClMul.cpp create mode 100644 src/runtime/gpu/cl/operators/ClMul.h delete mode 100644 src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp delete mode 100644 src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h diff --git a/Android.bp b/Android.bp index 78ac7f130f..046b1c08a5 100644 --- a/Android.bp +++ b/Android.bp @@ -306,8 +306,8 @@ cc_library_static { "src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp", "src/core/cpu/kernels/CpuFillKernel.cpp", "src/core/cpu/kernels/CpuFloorKernel.cpp", + "src/core/cpu/kernels/CpuMulKernel.cpp", "src/core/cpu/kernels/CpuPermuteKernel.cpp", - "src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp", "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp", "src/core/cpu/kernels/CpuPoolingKernel.cpp", "src/core/cpu/kernels/CpuQuantizationKernel.cpp", @@ -367,8 +367,8 @@ cc_library_static { "src/core/gpu/cl/kernels/ClFillKernel.cpp", 
"src/core/gpu/cl/kernels/ClFloorKernel.cpp", "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClMulKernel.cpp", "src/core/gpu/cl/kernels/ClPermuteKernel.cpp", - "src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp", "src/core/gpu/cl/kernels/ClPoolingKernel.cpp", "src/core/gpu/cl/kernels/ClQuantizationKernel.cpp", "src/core/gpu/cl/kernels/ClReshapeKernel.cpp", @@ -640,8 +640,8 @@ cc_library_static { "src/runtime/cpu/operators/CpuElementwiseUnary.cpp", "src/runtime/cpu/operators/CpuFill.cpp", "src/runtime/cpu/operators/CpuFloor.cpp", + "src/runtime/cpu/operators/CpuMul.cpp", "src/runtime/cpu/operators/CpuPermute.cpp", - "src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp", "src/runtime/cpu/operators/CpuPooling.cpp", "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp", "src/runtime/cpu/operators/CpuQuantization.cpp", @@ -663,9 +663,9 @@ cc_library_static { "src/runtime/gpu/cl/operators/ClFill.cpp", "src/runtime/gpu/cl/operators/ClFloor.cpp", "src/runtime/gpu/cl/operators/ClLogicalNot.cpp", + "src/runtime/gpu/cl/operators/ClMul.cpp", "src/runtime/gpu/cl/operators/ClPRelu.cpp", "src/runtime/gpu/cl/operators/ClPermute.cpp", - "src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp", "src/runtime/gpu/cl/operators/ClPooling.cpp", "src/runtime/gpu/cl/operators/ClQuantization.cpp", "src/runtime/gpu/cl/operators/ClReshape.cpp", diff --git a/SConscript b/SConscript index 63c2a483dc..da92409867 100644 --- a/SConscript +++ b/SConscript @@ -312,7 +312,7 @@ if env['neon']: 'src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp', 'src/core/cpu/kernels/CpuFillKernel.cpp', 'src/core/cpu/kernels/CpuFloorKernel.cpp', - 'src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp', + 'src/core/cpu/kernels/CpuMulKernel.cpp', 'src/core/cpu/kernels/CpuQuantizationKernel.cpp', 'src/core/cpu/kernels/CpuScaleKernel.cpp', 'src/core/cpu/kernels/CpuSoftmaxKernel.cpp', @@ -359,7 +359,7 @@ if env['neon']: 'src/runtime/cpu/operators/CpuElementwiseUnary.cpp', 'src/runtime/cpu/operators/CpuFill.cpp', 'src/runtime/cpu/operators/CpuFloor.cpp', - 'src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp', + 'src/runtime/cpu/operators/CpuMul.cpp', 'src/runtime/cpu/operators/CpuQuantization.cpp', 'src/runtime/cpu/operators/CpuReshape.cpp', 'src/runtime/cpu/operators/CpuScale.cpp', diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h index 14422383ad..029b924512 100644 --- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h @@ -34,7 +34,7 @@ class CLCompileContext; class ICLTensor; class ITensorInfo; -/** Basic function to run @ref opencl::ClPixelWiseMultiplication. */ +/** Basic function to run @ref opencl::ClMul. */ class CLPixelWiseMultiplication : public IFunction { public: @@ -123,7 +123,7 @@ private: std::unique_ptr _impl; }; -/** Basic function to run @ref opencl::ClComplexPixelWiseMultiplication. */ +/** Basic function to run @ref opencl::ClComplexMul. 
*/ class CLComplexPixelWiseMultiplication : public IFunction { public: diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index f8074e791a..4684c2d4b8 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -34,7 +34,7 @@ namespace arm_compute class ITensor; class ITensorInfo; -/** Basic function to run @ref cpu::CpuPixelWiseMultiplication */ +/** Basic function to run @ref cpu::CpuMul */ class NEPixelWiseMultiplication : public IFunction { public: @@ -131,7 +131,7 @@ private: std::unique_ptr _impl; }; -/** Basic function to run @ref cpu::CpuComplexPixelWiseMultiplication. */ +/** Basic function to run @ref cpu::CpuComplexMul. */ class NEComplexPixelWiseMultiplication : public IFunction { public: diff --git a/src/core/cpu/kernels/CpuMulKernel.cpp b/src/core/cpu/kernels/CpuMulKernel.cpp new file mode 100644 index 0000000000..dabf656e6e --- /dev/null +++ b/src/core/cpu/kernels/CpuMulKernel.cpp @@ -0,0 +1,1729 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/cpu/kernels/CpuMulKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +const float scale255_constant = 1.f / 255.f; +const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); +const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); + +inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_UNUSED(overflow_policy); + ARM_COMPUTE_UNUSED(rounding_policy); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, + DataType::S32, DataType::F16, DataType::F32); + if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); + } + + if(dst->total_size() > 0) + { + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + // clang-format off + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) && + !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) + , "Invalid data type combination"); + // clang-format on + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); + } + + if(std::abs(scale - scale255_constant) < 0.00001f) + { + ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && 
dst->data_type() == DataType::S32, + "Scale == 1/255 is not supported if input and dst are of data type S32"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO); + + int exponent = 0; + const float normalized_mantissa = std::frexp(scale, &exponent); + + // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 + // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 + // Moreover, it will be negative as we deal with 1/2^n + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); + } + + return Status{}; +} + +/* Scales a given vector by 1/255. + * + * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats. + * + * @param in Input vector to scale. + * @return Scaled dst rounded to nearest (round half up). + */ +inline int32x4_t scale255_S32_S32(int32x4_t in) +{ + // Scale + const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q); + // Round to nearest (round half up) + // Add +0.5 for all values + // Afterwards vcvt rounds toward zero + return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q)); +} + +inline uint16x8_t scale255_U16_U16(uint16x8_t in) +{ + const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in)))); + const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in)))); + return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1))); +} + +template +inline typename std::enable_if::value, int8x16_t>::type +vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) +{ + return vquantize_signed(val, info); +} + +template +inline typename std::enable_if::value, uint8x16_t>::type +vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) +{ + return vquantize(val, info); +} + +template +void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); + const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); + const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + + const float32x4x4_t out_f32x4x4 = + { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(non_broadcast_input_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); + const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto input1_q = wrapper::vloadq(input1_ptr + x); + const auto input2_q = wrapper::vloadq(input2_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = + { + 
vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(input1_ptr + x); + const T src2 = *(input2_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); + const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); + } +} + +void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); + const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); + + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const qsymm16x8x2_t input1_q = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const qsymm16x8x2_t input2_q = + { + { + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + } + }; + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = + { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + // Dequantize inputs + float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; + float tmp_in2 = 
static_cast(*(input2_ptr + x)) * input2_qua_info.scale; + float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst, lrintf() has same rounding mode as vcombine_s16 + int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); + qsymm16_t tmp_qua = static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); +} + +void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) +{ + ARM_COMPUTE_UNUSED(scale); + + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const qsymm16x8x2_t input1_q = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const qsymm16x8x2_t input2_q = + { + { + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + } + }; + + const int32x4x4_t in1_s32 = + { + { + vmovl_s16(vget_low_s16(input1_q.val[0])), + vmovl_s16(vget_high_s16(input1_q.val[0])), + vmovl_s16(vget_low_s16(input1_q.val[1])), + vmovl_s16(vget_high_s16(input1_q.val[1])), + } + }; + const int32x4x4_t in2_s32 = + { + { + vmovl_s16(vget_low_s16(input2_q.val[0])), + vmovl_s16(vget_high_s16(input2_q.val[0])), + vmovl_s16(vget_low_s16(input2_q.val[1])), + vmovl_s16(vget_high_s16(input2_q.val[1])), + } + }; + + const int32x4x4_t result = + { + { + vmulq_s32(in1_s32.val[0], in2_s32.val[0]), + vmulq_s32(in1_s32.val[1], in2_s32.val[1]), + vmulq_s32(in1_s32.val[2], in2_s32.val[2]), + vmulq_s32(in1_s32.val[3], in2_s32.val[3]), + } + }; + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + vst1q_s32(output_ptr + x + 8, result.val[2]); + vst1q_s32(output_ptr + x + 12, result.val[3]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + *(output_ptr + x) = tmp; + } + }, + input1, input2, dst); +} + +template +void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator 
input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16 / sizeof(uint8_t); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); + const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); + + uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); + const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); + uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); + const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); + + tmp1_high = vmulq_u16(tmp1_high, tmp2_high); + tmp1_low = vmulq_u16(tmp1_low, tmp2_low); + + if(is_scale255) + { + tmp1_high = scale255_U16_U16(tmp1_high); + tmp1_low = scale255_U16_U16(tmp1_low); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if(is_sat) + { + tmp1_high = vqshlq_u16(tmp1_high, vn); + tmp1_low = vqshlq_u16(tmp1_low, vn); + } + else + { + tmp1_high = vshlq_u16(tmp1_high, vn); + tmp1_low = vshlq_u16(tmp1_low, vn); + } + } + if(is_sat) + { + vst1q_u8(output_ptr, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); + } + else + { + vst1q_u8(output_ptr, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); + } + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + tmp = static_cast(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + if(is_sat && tmp > 255) + { + tmp = 255; + } + *(output_ptr + x) = static_cast(tmp); + } + }, + input1, input2, dst); +} + +template +inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n) +{ + int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1)); + const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2)); + int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1)); + const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2)); + + tmp1_high = vmulq_s32(tmp1_high, tmp2_high); + tmp1_low = vmulq_s32(tmp1_low, tmp2_low); + + if(is_scale255) + { + tmp1_high = scale255_S32_S32(tmp1_high); + tmp1_low = scale255_S32_S32(tmp1_low); + } + else + { + // Right shift amount + const int32x4_t vn = vdupq_n_s32(-n); + // Left shift amount + const int32x4_t vnl = vdupq_n_s32(n); + // Calculate conversion bit + const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high); + const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low); + const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31); + const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31); + const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high); + const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); + const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); + const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); + if(is_sat) + { + tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + else + { + tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + 
tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + } + + if(is_sat) + { + return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); + } + else + { + return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high)); + } +} + +template +inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) +{ + const int16x8x2_t result = + { + { + // First 8 elements + mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), + // Second 8 elements + mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n) + } + }; + + return result; +} + +template +void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const int16x8x2_t ta2 = + { + { + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + } + }; + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + + tmp = static_cast(tmp_f + 0.5f); + } + else + { + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + } + if(is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast(tmp); + } + }, + input1, input2, dst); +} + +template +inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n) +{ + const int32x2_t input1_1 = vget_low_s32(src1); + const int32x2_t input2_1 = vget_low_s32(src2); + const int32x2_t input1_2 = vget_high_s32(src1); + const int32x2_t input2_2 = vget_high_s32(src2); + + int64x2_t tmp_1 = vmull_s32(input1_1, input2_1); + int64x2_t tmp_2 = vmull_s32(input1_2, input2_2); + + // Apply scaling, conversion and rounding (round to zero) + // Right shift amount + const int64x2_t vn = vdupq_n_s64(-n); + // Left shift amount + const int64x2_t vnl = vdupq_n_s64(n); + // Calculate conversion bit + const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1); + const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63); + const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1); + const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s); + + const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2); + const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); + const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); + const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); + if(is_sat) + { + tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); + tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); + return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2)); + } + else + { + tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn); + tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn); + return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2)); + } +} + +template +inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) +{ + const int32x4x2_t result = + { + { + // First 4 elements + mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), + // Second 4 elements + mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n) + } + }; + + return result; +} + +template +void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x2_t broadcast_v = + { + { + broadcast_value_vec, + broadcast_value_vec, + } + }; + const int32x4x2_t non_broadcast_v = + { + { + vld1q_s32(non_broadcast_input_ptr + x), + vld1q_s32(non_broadcast_input_ptr + x + 4), + } + }; + const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int64_t tmp = static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); + + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + if(is_sat) + { + tmp = utility::clamp(tmp); + } + *(output_ptr + x) = static_cast(tmp); + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x2_t ta1 = + { + { + vld1q_s32(input1_ptr + x), + vld1q_s32(input1_ptr + x + 4), + } + }; + const int32x4x2_t ta2 = + { + { + vld1q_s32(input2_ptr + x), + vld1q_s32(input2_ptr + x + 4), + } + }; + const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + if(is_sat) + { + tmp = utility::clamp(tmp); + } + *(output_ptr + x) = static_cast(tmp); + } + }, + input1, input2, dst); + } +} + +void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, 
Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(float); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} + +void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = 
window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 8 / sizeof(float); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector::tag_type; + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); + float32x4_t b = vdupq_n_f32(broadcast_value); + + const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = wrapper::vmul(tmp0, b); + b = wrapper::vmul(b, mask); + + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); + const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); + auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); + auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per 
iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); + float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); + + const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = wrapper::vmul(tmp0, b); + + b = wrapper::vrev64(b); + b = wrapper::vmul(b, mask); + + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto a0 = *(input1_ptr + 2 * x); + const auto a1 = *(input1_ptr + 2 * x + 1); + const auto b0 = *(input2_ptr + 2 * x); + const auto b1 = *(input2_ptr + 2 * x + 1); + auto res1 = a0 * b0 - a1 * b1; + auto res2 = a0 * b1 + a1 * b0; + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + input1, input2, dst); + } +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + constexpr int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = + { + { + vdupq_n_f16(broadcast_value), + vdupq_n_f16(broadcast_value), + } + }; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t non_broadcast_v = + { + { + vld1q_f16(non_broadcast_input_ptr + x), + vld1q_f16(non_broadcast_input_ptr + x + 8), + } + }; + const float16x8x2_t result = + { + { + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + } + }; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t ta1 = + { + { + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + } + }; + const float16x8x2_t ta2 = + { + { + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + } + }; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = + { + { + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + } + }; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template +void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + 
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16 / sizeof(uint8_t); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); + const uint8x16_t av = wrapper::vloadq(input1_ptr + x); + + uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); + uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); + tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); + tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + + if(is_scale255) + { + tmp_low = scale255_U16_U16(tmp_low); + tmp_high = scale255_U16_U16(tmp_high); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if(is_sat) + { + tmp_low = vqshlq_u16(tmp_low, vn); + tmp_high = vqshlq_u16(tmp_high, vn); + } + else + { + tmp_low = vshlq_u16(tmp_low, vn); + tmp_high = vshlq_u16(tmp_high, vn); + } + } + + if(is_sat) + { + static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); + + tmp_low = vminq_u16(tmp_low, max); + tmp_high = vminq_u16(tmp_high, max); + } + + vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); + vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + tmp = static_cast(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + + if(is_sat) + { + tmp = (tmp > SHRT_MAX) ? 
SHRT_MAX : tmp; + } + + *(output_ptr + x) = static_cast(tmp); + } + }, + input1, input2, dst); +} + +template +void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const uint8x8x2_t ta2u = + { + { + vld1_u8(input2_ptr + x), + vld1_u8(input2_ptr + x + 8), + } + }; + const int16x8x2_t ta2 = + { + { + vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), + vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) + } + }; + + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast(tmp) * scale255_constant; + + tmp = static_cast(tmp_f + 0.5f); + } + else + { + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast(mask)) >> n; + } + } + if(is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast(tmp); + } + }, + input1, input2, dst); +} + +template +void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Simply swap the two input buffers + mul_S16_U8_S16(src2, src1, out, window, n); +} +} // namespace + +void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_UNUSED(rounding_policy); + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + // Auto initialize dst if not initialized + set_shape_if_empty(*dst, out_shape); + + _scale = scale; + _scale_exponent = 0; + _func_quantized = nullptr; + _func_int = nullptr; + _func_float = nullptr; + + bool is_scale_255 = false; + // Check and validate scaling factor + if(std::abs(scale - scale255_constant) < 0.00001f) + { + is_scale_255 = true; + } + else + { + int exponent = 0; + + std::frexp(scale, &exponent); + + // Store the positive exponent. We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + _scale_exponent = std::abs(exponent - 1); + } + + const DataType dt_input1 = src1->data_type(); + const DataType dt_input2 = src2->data_type(); + const DataType dt_output = dst->data_type(); + const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); + + switch(dt_input1) + { + case DataType::QASYMM8: + if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) + { + _func_quantized = &mul_saturate_quantized_8; + } + break; + case DataType::QASYMM8_SIGNED: + if(dt_input2 == DataType::QASYMM8_SIGNED) + { + _func_quantized = &mul_saturate_quantized_8; + ; + } + break; + case DataType::QSYMM16: + if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) + { + _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; + } + else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) + { + _func_int = &mul_QSYMM16_QSYMM16_S32; + } + break; + case DataType::S16: + if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; + } + else + { + _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; + } + } + if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; + } + else + { + _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; + } + } + break; + case DataType::S32: + if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) + { + _func_int = is_sat ? &mul_S32_S32_S32 : &mul_S32_S32_S32; + } + break; + case DataType::U8: + if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; + } + else + { + _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; + } + } + else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; + } + else + { + _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; + } + } + else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? 
&mul_U8_S16_S16 : &mul_U8_S16_S16; + } + else + { + _func_int = is_sat ? &mul_U8_S16_S16 : &mul_U8_S16_S16; + } + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + _func_float = &mul_F16_F16_F16; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + _func_float = &mul_F32_F32_F32; + break; + default: + ARM_COMPUTE_ERROR("You called with the wrong img formats"); + } + + // Configure kernel window + Window win = calculate_max_window(out_shape); + + ICpuKernel::configure(win); +} + +Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); + + return Status{}; +} + +void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if(_func_quantized != nullptr) + { + (*_func_quantized)(src1, src2, dst, window, _scale); + } + else if(_func_int != nullptr) + { + (*_func_int)(src1, src2, dst, window, _scale_exponent); + } + else + { + ARM_COMPUTE_ERROR_ON(_func_float == nullptr); + (*_func_float)(src1, src2, dst, window, _scale); + } +} +const char *CpuMulKernel::name() const +{ + return "CpuMulKernel"; +} +namespace +{ +Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + } + + return Status{}; +} +} // namespace + +void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + // Auto initialize dst if not initialized + const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type()); + auto_init_if_empty(*dst, out_info); + + // Configure kernel window + Window win = calculate_max_window(out_shape); + + ICpuKernel::configure(win); +} + +Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst)); + + return Status{}; +} + +void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + 
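+    // Sketch of the per-element math this kernel dispatches to (c_mul_F32_F32_F32_n below),
+    // treating each pair of F32 channels as one complex value (first channel acting as the
+    // real part, second as the imaginary part):
+    //   dst_re = src1_re * src2_re - src1_im * src2_im
+    //   dst_im = src1_re * src2_im + src1_im * src2_re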
ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + c_mul_F32_F32_F32_n(src1, src2, dst, window); +} + +const char *CpuComplexMulKernel::name() const +{ + return "CpuComplexMulKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuMulKernel.h b/src/core/cpu/kernels/CpuMulKernel.h new file mode 100644 index 0000000000..3e667bc4be --- /dev/null +++ b/src/core/cpu/kernels/CpuMulKernel.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H +#define ARM_COMPUTE_CPU_MUL_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform multiplication between two tensors */ +class CpuMulKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel); + /** Initialise the kernel's input, dst and border mode. + * + * Valid configurations (Src1,Src2) -> Dst : + * + * Support: Broadcast? Scale=1/255? + * - (U8,U8) -> U8, S16 N Y + * - (U8,S16) -> S16 N Y + * - (S16,U8) -> S16 N Y + * - (S16,S16) -> S16 N Y + * - (S32,S32) -> S32 Y N + * - (F16,F16) -> F16 N Y + * - (F32,F32) -> F32 Y Y + * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y + * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] scale Scale to apply after multiplication. 
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype + * @param[in] rounding_policy Rounding policy. + */ + void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + + // Inherited methods overridden + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Common signature for all the specialised multiplication functions with integer scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Integer scale factor. + */ + using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); + /** Common signature for all the specialised multiplication functions with float scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Float scale factor. + */ + using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Float scale factor. + * + */ + using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + + MulFunctionFloat *_func_float{ nullptr }; + MulFunctionInt *_func_int{ nullptr }; + MulFunctionQuantized *_func_quantized{ nullptr }; + float _scale{ 0 }; + int _scale_exponent{ 0 }; +}; + +/** Interface for the complex pixelwise multiplication kernel. */ +class CpuComplexMulKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuComplexMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel); + /** Initialise the kernel's src, dst and border mode. + * + * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). + * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[out] dst The dst tensor, Data types supported: same as @p src1. Number of channels supported: same as @p src1. 
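+     *
+     * @note Each element is interpreted as a complex value (the first channel acting as the real part,
+     *       the second as the imaginary part); the kernel computes the element-wise complex product
+     *       of @p src1 and @p src2.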
+ */ + void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuComplexMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */ diff --git a/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp b/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp deleted file mode 100644 index 91b7552ecf..0000000000 --- a/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp +++ /dev/null @@ -1,1729 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -const float scale255_constant = 1.f / 255.f; -const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); -const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); - -inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, - DataType::S32, DataType::F16, DataType::F32); - if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); - } - - if(dst->total_size() > 0) - { - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - // clang-format off - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) && - !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && - !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) - , "Invalid data type combination"); - // clang-format on - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); - } - - if(std::abs(scale - scale255_constant) < 0.00001f) - { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == 
DataType::S32 && dst->data_type() == DataType::S32, - "Scale == 1/255 is not supported if input and dst are of data type S32"); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO); - - int exponent = 0; - const float normalized_mantissa = std::frexp(scale, &exponent); - - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 - // Moreover, it will be negative as we deal with 1/2^n - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); - } - - return Status{}; -} - -/* Scales a given vector by 1/255. - * - * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats. - * - * @param in Input vector to scale. - * @return Scaled dst rounded to nearest (round half up). - */ -inline int32x4_t scale255_S32_S32(int32x4_t in) -{ - // Scale - const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q); - // Round to nearest (round half up) - // Add +0.5 for all values - // Afterwards vcvt rounds toward zero - return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q)); -} - -inline uint16x8_t scale255_U16_U16(uint16x8_t in) -{ - const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in)))); - const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in)))); - return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1))); -} - -template -inline typename std::enable_if::value, int8x16_t>::type -vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) -{ - return vquantize_signed(val, info); -} - -template -inline typename std::enable_if::value, uint8x16_t>::type -vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) -{ - return vquantize(val, info); -} - -template -void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); - const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); - - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(non_broadcast_input_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, non_broadcast_qinfo); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); - - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto input1_q = wrapper::vloadq(input1_ptr + x); - const auto input2_q = wrapper::vloadq(input2_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - - const float32x4x4_t out_f32x4x4 = - { - 
vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(input1_ptr + x); - const T src2 = *(input2_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper::dequantize(src1, input1_qua_info); - const float tmp_in2 = Qasymm8QuantizationHelper::dequantize(src2, input2_qua_info); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); - } -} - -void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); - const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); - const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const qsymm16x8x2_t input1_q = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - float tmp_in1 = static_cast(*(input1_ptr + x)) * input1_qua_info.scale; - float tmp_in2 = 
static_cast(*(input2_ptr + x)) * input2_qua_info.scale; - float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst, lrintf() has same rounding mode as vcombine_s16 - int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); - qsymm16_t tmp_qua = static_cast(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); -} - -void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) -{ - ARM_COMPUTE_UNUSED(scale); - - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const qsymm16x8x2_t input1_q = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - - const int32x4x4_t in1_s32 = - { - { - vmovl_s16(vget_low_s16(input1_q.val[0])), - vmovl_s16(vget_high_s16(input1_q.val[0])), - vmovl_s16(vget_low_s16(input1_q.val[1])), - vmovl_s16(vget_high_s16(input1_q.val[1])), - } - }; - const int32x4x4_t in2_s32 = - { - { - vmovl_s16(vget_low_s16(input2_q.val[0])), - vmovl_s16(vget_high_s16(input2_q.val[0])), - vmovl_s16(vget_low_s16(input2_q.val[1])), - vmovl_s16(vget_high_s16(input2_q.val[1])), - } - }; - - const int32x4x4_t result = - { - { - vmulq_s32(in1_s32.val[0], in2_s32.val[0]), - vmulq_s32(in1_s32.val[1], in2_s32.val[1]), - vmulq_s32(in1_s32.val[2], in2_s32.val[2]), - vmulq_s32(in1_s32.val[3], in2_s32.val[3]), - } - }; - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - vst1q_s32(output_ptr + x + 8, result.val[2]); - vst1q_s32(output_ptr + x + 12, result.val[3]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - *(output_ptr + x) = tmp; - } - }, - input1, input2, dst); -} - -template -void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator 
input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16 / sizeof(uint8_t); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); - const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); - - uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); - const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); - uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); - const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); - - tmp1_high = vmulq_u16(tmp1_high, tmp2_high); - tmp1_low = vmulq_u16(tmp1_low, tmp2_low); - - if(is_scale255) - { - tmp1_high = scale255_U16_U16(tmp1_high); - tmp1_low = scale255_U16_U16(tmp1_low); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); - - if(is_sat) - { - tmp1_high = vqshlq_u16(tmp1_high, vn); - tmp1_low = vqshlq_u16(tmp1_low, vn); - } - else - { - tmp1_high = vshlq_u16(tmp1_high, vn); - tmp1_low = vshlq_u16(tmp1_low, vn); - } - } - if(is_sat) - { - vst1q_u8(output_ptr, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); - } - else - { - vst1q_u8(output_ptr, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); - } - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - uint16_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - if(is_sat && tmp > 255) - { - tmp = 255; - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n) -{ - int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1)); - const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2)); - int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1)); - const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2)); - - tmp1_high = vmulq_s32(tmp1_high, tmp2_high); - tmp1_low = vmulq_s32(tmp1_low, tmp2_low); - - if(is_scale255) - { - tmp1_high = scale255_S32_S32(tmp1_high); - tmp1_low = scale255_S32_S32(tmp1_low); - } - else - { - // Right shift amount - const int32x4_t vn = vdupq_n_s32(-n); - // Left shift amount - const int32x4_t vnl = vdupq_n_s32(n); - // Calculate conversion bit - const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high); - const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low); - const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31); - const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31); - const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high); - const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); - const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); - const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); - if(is_sat) - { - tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); - tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); - } - else - { - tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); - 
tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); - } - } - - if(is_sat) - { - return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); - } - else - { - return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high)); - } -} - -template -inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) -{ - const int16x8x2_t result = - { - { - // First 8 elements - mul_S16_S16_S16_n_loop(src1.val[0], src2.val[0], n), - // Second 8 elements - mul_S16_S16_S16_n_loop(src1.val[1], src2.val[1], n) - } - }; - - return result; -} - -template -void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n) -{ - const int32x2_t input1_1 = vget_low_s32(src1); - const int32x2_t input2_1 = vget_low_s32(src2); - const int32x2_t input1_2 = vget_high_s32(src1); - const int32x2_t input2_2 = vget_high_s32(src2); - - int64x2_t tmp_1 = vmull_s32(input1_1, input2_1); - int64x2_t tmp_2 = vmull_s32(input1_2, input2_2); - - // Apply scaling, conversion and rounding (round to zero) - // Right shift amount - const int64x2_t vn = vdupq_n_s64(-n); - // Left shift amount - const int64x2_t vnl = vdupq_n_s64(n); - // Calculate conversion bit - const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1); - const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63); - const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1); - const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s); - - const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2); - const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); - const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); - const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); - if(is_sat) - { - tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); - tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); - return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2)); - } - else - { - tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn); - tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn); - return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2)); - } -} - -template -inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) -{ - const int32x4x2_t result = - { - { - // First 4 elements - mul_S32_S32_S32_n_loop(src1.val[0], src2.val[0], n), - // Second 4 elements - mul_S32_S32_S32_n_loop(src1.val[1], src2.val[1], n) - } - }; - - return result; -} - -template -void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const int32_t broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t broadcast_v = - { - { - broadcast_value_vec, - broadcast_value_vec, - } - }; - const int32x4x2_t non_broadcast_v = - { - { - vld1q_s32(non_broadcast_input_ptr + x), - vld1q_s32(non_broadcast_input_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(broadcast_v, non_broadcast_v, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(broadcast_value) * static_cast(*(non_broadcast_input_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t ta1 = - { - { - vld1q_s32(input1_ptr + x), - vld1q_s32(input1_ptr + x + 4), - } - }; - const int32x4x2_t ta2 = - { - { - vld1q_s32(input2_ptr + x), - vld1q_s32(input2_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k(ta1, ta2, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp(tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); - } -} - -void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, 
Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 16 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} - -void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = 
window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - constexpr int window_step_x = 8 / sizeof(float); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - - using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - const float broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); - float32x4_t b = vdupq_n_f32(broadcast_value); - - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - - float32x4_t res = wrapper::vmul(tmp0, b); - b = wrapper::vmul(b, mask); - - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); - const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); - auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); - auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per 
iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); - float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); - - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - - float32x4_t res = wrapper::vmul(tmp0, b); - - b = wrapper::vrev64(b); - b = wrapper::vmul(b, mask); - - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto a0 = *(input1_ptr + 2 * x); - const auto a1 = *(input1_ptr + 2 * x + 1); - const auto b0 = *(input2_ptr + 2 * x); - const auto b1 = *(input2_ptr + 2 * x + 1); - auto res1 = a0 * b0 - a1 * b1; - auto res2 = a0 * b1 + a1 * b0; - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - input1, input2, dst); - } -} - -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) -{ - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator dst(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - const auto broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = - { - { - vdupq_n_f16(broadcast_value), - vdupq_n_f16(broadcast_value), - } - }; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = - { - { - vld1q_f16(non_broadcast_input_ptr + x), - vld1q_f16(non_broadcast_input_ptr + x + 8), - } - }; - const float16x8x2_t result = - { - { - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); - } - else - { - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = - { - { - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - } - }; - const float16x8x2_t ta2 = - { - { - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - } - }; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = - { - { - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); - } -} -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - -template -void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - 
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16 / sizeof(uint8_t); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); - const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - - uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); - uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); - tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); - tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); - - if(is_scale255) - { - tmp_low = scale255_U16_U16(tmp_low); - tmp_high = scale255_U16_U16(tmp_high); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); - - if(is_sat) - { - tmp_low = vqshlq_u16(tmp_low, vn); - tmp_high = vqshlq_u16(tmp_high, vn); - } - else - { - tmp_low = vshlq_u16(tmp_low, vn); - tmp_high = vshlq_u16(tmp_high, vn); - } - } - - if(is_sat) - { - static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); - - tmp_low = vminq_u16(tmp_low, max); - tmp_high = vminq_u16(tmp_high, max); - } - - vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); - vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - tmp = static_cast(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? 
SHRT_MAX : tmp; - } - - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Create input windows - Window win = window; - Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src1, input1_win); - Iterator input2(src2, input2_win); - Iterator dst(out, win); - - const int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const uint8x8x2_t ta2u = - { - { - vld1_u8(input2_ptr + x), - vld1_u8(input2_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) - } - }; - - const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); - - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast(*(input1_ptr + x)) * static_cast(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast(tmp) * scale255_constant; - - tmp = static_cast(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast(mask)) >> n; - } - } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast(tmp); - } - }, - input1, input2, dst); -} - -template -void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) -{ - // Simply swap the two input buffers - mul_S16_U8_S16(src2, src1, out, window, n); -} -} // namespace - -void CpuPixelWiseMultiplicationKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_UNUSED(rounding_policy); - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - set_shape_if_empty(*dst, out_shape); - - _scale = scale; - _scale_exponent = 0; - _func_quantized = nullptr; - _func_int = nullptr; - _func_float = nullptr; - - bool is_scale_255 = false; - // Check and validate scaling factor - if(std::abs(scale - scale255_constant) < 0.00001f) - { - is_scale_255 = true; - } - else - { - int exponent = 0; - - std::frexp(scale, &exponent); - - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - _scale_exponent = std::abs(exponent - 1); - } - - const DataType dt_input1 = src1->data_type(); - const DataType dt_input2 = src2->data_type(); - const DataType dt_output = dst->data_type(); - const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); - - switch(dt_input1) - { - case DataType::QASYMM8: - if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) - { - _func_quantized = &mul_saturate_quantized_8; - } - break; - case DataType::QASYMM8_SIGNED: - if(dt_input2 == DataType::QASYMM8_SIGNED) - { - _func_quantized = &mul_saturate_quantized_8; - ; - } - break; - case DataType::QSYMM16: - if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) - { - _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; - } - else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) - { - _func_int = &mul_QSYMM16_QSYMM16_S32; - } - break; - case DataType::S16: - if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; - } - else - { - _func_int = is_sat ? &mul_S16_U8_S16 : &mul_S16_U8_S16; - } - } - if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; - } - else - { - _func_int = is_sat ? &mul_S16_S16_S16 : &mul_S16_S16_S16; - } - } - break; - case DataType::S32: - if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) - { - _func_int = is_sat ? &mul_S32_S32_S32 : &mul_S32_S32_S32; - } - break; - case DataType::U8: - if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; - } - else - { - _func_int = is_sat ? &mul_U8_U8_U8 : &mul_U8_U8_U8; - } - } - else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; - } - else - { - _func_int = is_sat ? &mul_U8_U8_S16 : &mul_U8_U8_S16; - } - } - else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) - { - if(is_scale_255) - { - _func_int = is_sat ? 
&mul_U8_S16_S16 : &mul_U8_S16_S16; - } - else - { - _func_int = is_sat ? &mul_U8_S16_S16 : &mul_U8_S16_S16; - } - } - break; -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - _func_float = &mul_F16_F16_F16; - break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - _func_float = &mul_F32_F32_F32; - break; - default: - ARM_COMPUTE_ERROR("You called with the wrong img formats"); - } - - // Configure kernel window - Window win = calculate_max_window(out_shape); - - ICpuKernel::configure(win); -} - -Status CpuPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); - - return Status{}; -} - -void CpuPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_func_quantized != nullptr) - { - (*_func_quantized)(src1, src2, dst, window, _scale); - } - else if(_func_int != nullptr) - { - (*_func_int)(src1, src2, dst, window, _scale_exponent); - } - else - { - ARM_COMPUTE_ERROR_ON(_func_float == nullptr); - (*_func_float)(src1, src2, dst, window, _scale); - } -} -const char *CpuPixelWiseMultiplicationKernel::name() const -{ - return "CpuPixelWiseMultiplicationKernel"; -} -namespace -{ -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - } - - return Status{}; -} -} // namespace - -void CpuComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst)); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - // Auto initialize dst if not initialized - const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type()); - auto_init_if_empty(*dst, out_info); - - // Configure kernel window - Window win = calculate_max_window(out_shape); - - ICpuKernel::configure(win); -} - -Status CpuComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst)); - - return Status{}; -} - 
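
The integer scaling path selected in the configure() above stores only the positive exponent n when scale == 1/2^n, using std::frexp (which returns a mantissa of 0.5, hence the "exponent - 1" compensation noted in the comments). A minimal standalone sketch of that arithmetic — the helper name is hypothetical and not taken from the library — might look like this:

    #include <cassert>
    #include <cmath>
    #include <cstdlib>

    // Hypothetical helper mirroring how the scale exponent is derived above.
    static int scale_to_exponent(float scale)
    {
        int exponent = 0;
        std::frexp(scale, &exponent);  // for scale == 1/2^n, frexp yields 0.5f * 2^exponent with exponent == 1 - n
        return std::abs(exponent - 1); // recover the positive exponent n
    }

    int main()
    {
        assert(scale_to_exponent(1.0f) == 0);             // 1/2^0
        assert(scale_to_exponent(0.125f) == 3);           // 1/2^3
        assert(scale_to_exponent(1.0f / 32768.0f) == 15); // 1/2^15, the largest n the kernel accepts
        return 0;
    }

For any other scale value the kernel falls back to the float path, which is why the result of this reduction is only consulted when the power-of-two check succeeds.
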
-void CpuComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - - auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - c_mul_F32_F32_F32_n(src1, src2, dst, window); -} - -const char *CpuComplexPixelWiseMultiplicationKernel::name() const -{ - return "CpuComplexPixelWiseMultiplicationKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h b/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h deleted file mode 100644 index 567f08d06e..0000000000 --- a/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_PIXELWISE_MULTIPLICATION_KERNEL_H -#define ARM_COMPUTE_CPU_PIXELWISE_MULTIPLICATION_KERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/cpu/ICpuKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -/** Interface for the kernel to perform addition between two tensors */ -class CpuPixelWiseMultiplicationKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuPixelWiseMultiplicationKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPixelWiseMultiplicationKernel); - /** Initialise the kernel's input, dst and border mode. - * - * Valid configurations (Src1,Src2) -> Dst : - * - * Support: Broadcast? Scale=1/255? - * - (U8,U8) -> U8, S16 N Y - * - (U8,S16) -> S16 N Y - * - (S16,U8) -> S16 N Y - * - (S16,S16) -> S16 N Y - * - (S32,S32) -> S32 Y N - * - (F16,F16) -> F16 N Y - * - (F32,F32) -> F32 Y Y - * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y - * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in] src1 First input tensor. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - /** Static function to check if given info will lead to a valid configuration of @ref CpuPixelWiseMultiplicationKernel - * - * Valid configurations (Src1,Src2) -> Dst : - * Support: Broadcast? Scale=1/255? - * - (U8,U8) -> U8, S16 N Y - * - (U8,S16) -> S16 N Y - * - (S16,U8) -> S16 N Y - * - (S16,S16) -> S16 N Y - * - (S32,S32) -> S32 Y N - * - (F16,F16) -> F16 N Y - * - (F32,F32) -> F32 Y Y - * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y - * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in] src1 First src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] src2 Second src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] dst Dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the srcs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); - - // Inherited methods overridden - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; - -private: - /** Common signature for all the specialised multiplication functions with integer scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Integer scale factor. - */ - using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); - /** Common signature for all the specialised multiplication functions with float scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Float scale factor. 
- */ - using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor - * - * @param[in] src1 Src1 tensor object. - * @param[in] src2 Src2 tensor object. - * @param[out] dst Dst tensor object. - * @param[in] window Region on which to execute the kernel - * @param[in] scale Float scale factor. - * - */ - using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - - MulFunctionFloat *_func_float{ nullptr }; - MulFunctionInt *_func_int{ nullptr }; - MulFunctionQuantized *_func_quantized{ nullptr }; - float _scale{ 0 }; - int _scale_exponent{ 0 }; -}; - -/** Interface for the complex pixelwise multiplication kernel. */ -class CpuComplexPixelWiseMultiplicationKernel : public ICpuKernel -{ -public: - /** Default constructor */ - CpuComplexPixelWiseMultiplicationKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexPixelWiseMultiplicationKernel); - /** Initialise the kernel's src, dst and border mode. - * - * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[out] dst The dst tensor, Data types supported: same as @p src1. Number of channels supported: same as @p src1. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration of @ref CpuComplexPixelWiseMultiplicationKernel - * - * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; - const char *name() const override; -}; -} // namespace kernels -} // namespace cpu -} // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_PIXELWISE_MULTIPLICATION_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClMulKernel.cpp b/src/core/gpu/cl/kernels/ClMulKernel.cpp new file mode 100644 index 0000000000..837324ede2 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClMulKernel.cpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClMulKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(overflow_policy); + ARM_COMPUTE_UNUSED(rounding_policy); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), + "Dst can only be U8 if both src are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), + "Dst can only be QASYMM8 if both src are QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), + "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), + "Dst can only be QSYMM16 if both src are QSYMM16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::S32 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), + "Dst can only be S32 
if both src are QSYMM16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + } + + return Status{}; +} +} // namespace + +void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, + scale, overflow_policy, rounding_policy, act_info)); + + auto padding_info = get_padding_info({ src1, src2, dst }); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); + + int scale_int = -1; + // Extract sign, exponent and mantissa + int exponent = 0; + float normalized_mantissa = std::frexp(scale, &exponent); + // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 + // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 + // Moreover, it will be negative as we deal with 1/2^n + if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) + { + // Store the positive exponent. We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + scale_int = std::abs(exponent - 1); + } + + std::string acc_type; + // Check if it has float src and dst + if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) + { + scale_int = -1; + acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half"; + } + else + { + if(src1->element_size() == 2 || src2->element_size() == 2) + { + // Use 32-bit accumulator for 16-bit input + acc_type = "int"; + } + else + { + // Use 16-bit accumulator for 8-bit input + acc_type = "ushort"; + } + } + + const bool is_quantized = is_data_type_quantized(src1->data_type()); + const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0)); + const unsigned int vec_size_leftover = dst->dimension(0) % vec_size; + + // Set kernel build options + std::string kernel_name = "pixelwise_mul"; + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); + build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); + build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? 
"1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); + if(is_quantized && (dst->data_type() != DataType::S32)) + { + const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()), + "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()), + "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()), + "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + kernel_name += "_quantized"; + } + else + { + kernel_name += (scale_int >= 0) ? "_int" : "_float"; + build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); + build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); + build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); + if(act_info.enabled()) + { + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + } + } + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set scale argument + unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + + if(scale_int >= 0 && !is_quantized) + { + _kernel.setArg(idx++, scale_int); + } + else + { + _kernel.setArg(idx++, scale); + } + + Window win = calculate_max_window(*dst, Steps(vec_size)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(dst->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); +} + +Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy 
overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); + + return Status{}; +} + +void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); + const TensorShape &in_shape2 = src_1->info()->tensor_shape(); + const TensorShape &out_shape = dst->info()->tensor_shape(); + + bool can_collapse = true; + if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + + const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src_0, slice_input1); + add_3D_tensor_argument(idx, src_1, slice_input2); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); + } + while(collapsed.slide_window_slice_3D(slice)); +} + +namespace +{ +constexpr unsigned int vec_size_complex = 1; + +Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + } + + return Status{}; +} +} // namespace + +void 
ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); + + auto padding_info = get_padding_info({ src1, src2, dst }); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + if(act_info.enabled()) + { + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + } + + // Create kernel + _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options()); + + Window win = calculate_max_window(*dst, Steps(vec_size_complex)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); + + return Status{}; +} + +void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); + const TensorShape &in_shape2 = src_1->info()->tensor_shape(); + const TensorShape &out_shape = dst->info()->tensor_shape(); + + bool can_collapse = true; + if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + + const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src_0, slice_input1); + add_3D_tensor_argument(idx, src_1, slice_input2); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClMulKernel.h b/src/core/gpu/cl/kernels/ClMulKernel.h new file mode 100644 index 0000000000..e2e54a836e --- /dev/null +++ b/src/core/gpu/cl/kernels/ClMulKernel.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_MUL_KERNEL_H +#define ARM_COMPUTE_CL_MUL_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the pixelwise multiplication kernel. */ +class ClMulKernel : public IClKernel +{ +public: + /** Default constructor */ + ClMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel); + /** Initialise the kernel's src and dst. + * + * Valid configurations (Input1,Input2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,U8) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * - (QSYMM16,QSYMM16) -> S32 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] scale Scale to apply after multiplication. 
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; + +/** Interface for the complex pixelwise multiplication kernel. */ +class ClComplexMulKernel : public ICLKernel +{ +public: + /** Default constructor */ + ClComplexMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel); + /** Initialise the kernel's src and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2. + * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClComplexMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_MUL_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp deleted file mode 100644 index 5aed824d2c..0000000000 --- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.cpp +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/OpenCL.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/CL/CLValidate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(overflow_policy); - ARM_COMPUTE_UNUSED(rounding_policy); - - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), - "Dst can only be U8 if both src are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), - "Dst can only be QASYMM8 if both 
src are QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), - "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), - "Dst can only be QSYMM16 if both src are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::S32 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), - "Dst can only be S32 if both src are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - } - - return Status{}; -} -} // namespace - -void ClPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, - scale, overflow_policy, rounding_policy, act_info)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); - - int scale_int = -1; - // Extract sign, exponent and mantissa - int exponent = 0; - float normalized_mantissa = std::frexp(scale, &exponent); - // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 - // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 - // Moreover, it will be negative as we deal with 1/2^n - if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) - { - // Store the positive exponent. We know that we compute 1/2^n - // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 - scale_int = std::abs(exponent - 1); - } - - std::string acc_type; - // Check if it has float src and dst - if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) - { - scale_int = -1; - acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half"; - } - else - { - if(src1->element_size() == 2 || src2->element_size() == 2) - { - // Use 32-bit accumulator for 16-bit input - acc_type = "int"; - } - else - { - // Use 16-bit accumulator for 8-bit input - acc_type = "ushort"; - } - } - - const bool is_quantized = is_data_type_quantized(src1->data_type()); - const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0)); - const unsigned int vec_size_leftover = dst->dimension(0) % vec_size; - - // Set kernel build options - std::string kernel_name = "pixelwise_mul"; - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); - build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); - build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? 
"1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - if(is_quantized && (dst->data_type() != DataType::S32)) - { - const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); - const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); - const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - - build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()), - "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()), - "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()), - "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset)); - build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); - build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); - build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); - kernel_name += "_quantized"; - } - else - { - kernel_name += (scale_int >= 0) ? "_int" : "_float"; - build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); - build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); - build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); - if(act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set scale argument - unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - - if(scale_int >= 0 && !is_quantized) - { - _kernel.setArg(idx++, scale_int); - } - else - { - _kernel.setArg(idx++, scale); - } - - Window win = calculate_max_window(*dst, Steps(vec_size)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(dst->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src1->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(src2->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(0)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(dst->dimension(2)); -} - 
-Status ClPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - - return Status{}; -} - -void ClPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); - add_3D_tensor_argument(idx, src_1, slice_input2); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} - -namespace -{ -constexpr unsigned int vec_size_complex = 1; - -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); - - // Validate in case of configured dst - if(dst->total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - } - - return Status{}; -} -} // namespace - -void ClComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); - - auto padding_info = get_padding_info({ src1, src2, dst }); - - const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - if(act_info.enabled()) - { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); - build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - } - - // Create kernel - _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options()); - - Window win = calculate_max_window(*dst, Steps(vec_size_complex)); - ICLKernel::configure_internal(win); - - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); -} - -Status ClComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); - - return Status{}; -} - -void 
ClComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const auto src_0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); - - const TensorShape &in_shape1 = src_0->info()->tensor_shape(); - const TensorShape &in_shape2 = src_1->info()->tensor_shape(); - const TensorShape &out_shape = dst->info()->tensor_shape(); - - bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) - { - can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) - { - can_collapse = (in_shape1[d] == in_shape2[d]); - } - } - - bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; - - const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; - const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; - - Window slice = collapsed.first_slice_window_3D(); - Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); - Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); - - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, src_0, slice_input1); - add_3D_tensor_argument(idx, src_1, slice_input2); - add_3D_tensor_argument(idx, dst, slice); - enqueue(queue, *this, slice, lws_hint()); - - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); - ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); -} -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h b/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h deleted file mode 100644 index 5b827262a1..0000000000 --- a/src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H -#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H - -#include "src/core/common/Macros.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -/** Interface for the pixelwise multiplication kernel. */ -class ClPixelWiseMultiplicationKernel : public IClKernel -{ -public: - /** Default constructor */ - ClPixelWiseMultiplicationKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPixelWiseMultiplicationKernel); - /** Initialise the kernel's src and dst. - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,U8) -> S16 - * - (S16,S16) -> S16 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - (QSYMM16,QSYMM16) -> S32 - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClPixelWiseMultiplicationKernel - * - * Valid configurations (Input1,Input2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,U8) -> S16 - * - (S16,S16) -> S16 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - (QSYMM16,QSYMM16) -> S32 - * - * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. 
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; - -/** Interface for the complex pixelwise multiplication kernel. */ -class ClComplexPixelWiseMultiplicationKernel : public ICLKernel -{ -public: - /** Default constructor */ - ClComplexPixelWiseMultiplicationKernel() = default; - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexPixelWiseMultiplicationKernel); - /** Initialise the kernel's src and dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2. - * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClComplexPixelWiseMultiplicationKernel - * - * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2. - * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H */ diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp index efebf2b84c..932659268d 100644 --- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp +++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp @@ -26,7 +26,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/ICLKernel.h" -#include "src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h" +#include "src/runtime/gpu/cl/operators/ClMul.h" #include @@ -34,10 +34,10 @@ namespace arm_compute { struct CLPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{ nullptr }; + const ICLTensor *src_1{ nullptr }; + ICLTensor *dst{ nullptr }; + std::unique_ptr op{ nullptr }; }; CLPixelWiseMultiplication::CLPixelWiseMultiplication() @@ -60,14 +60,14 @@ void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_contex _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique(); + _impl->op = std::make_unique(); _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); } Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { - return opencl::ClPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } void CLPixelWiseMultiplication::run() @@ -82,10 +82,10 @@ void CLPixelWiseMultiplication::run() struct CLComplexPixelWiseMultiplication::Impl { - const ICLTensor *src_0{ nullptr }; - const ICLTensor *src_1{ nullptr }; - ICLTensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ICLTensor *src_0{ nullptr }; + const ICLTensor *src_1{ nullptr }; + ICLTensor *dst{ nullptr }; + std::unique_ptr op{ nullptr }; }; CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() @@ -106,13 +106,13 @@ void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique(); + _impl->op = std::make_unique(); _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info); } Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) { - return opencl::ClComplexPixelWiseMultiplication::validate(input1, input2, output, act_info); + return opencl::ClComplexMul::validate(input1, input2, output, act_info); } void CLComplexPixelWiseMultiplication::run() diff --git 
a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index 4d7fef89ed..3a2f1984b4 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -24,7 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" -#include "src/runtime/cpu/operators/CpuPixelWiseMultiplication.h" +#include "src/runtime/cpu/operators/CpuMul.h" #include @@ -32,10 +32,10 @@ namespace arm_compute { struct NEPixelWiseMultiplication::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr op{ nullptr }; }; NEPixelWiseMultiplication::NEPixelWiseMultiplication() @@ -47,7 +47,7 @@ NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { - return cpu::CpuPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); + return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, @@ -56,7 +56,7 @@ void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor * _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique(); + _impl->op = std::make_unique(); _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); } @@ -71,10 +71,10 @@ void NEPixelWiseMultiplication::run() struct NEComplexPixelWiseMultiplication::Impl { - ITensor *src_0{ nullptr }; - ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src_0{ nullptr }; + ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr op{ nullptr }; }; NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() @@ -85,7 +85,7 @@ NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) { - return cpu::CpuComplexPixelWiseMultiplication::validate(input1, input2, output, act_info); + return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) @@ -93,7 +93,7 @@ void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique(); + _impl->op = std::make_unique(); _impl->op->configure(input1->info(), input2->info(), output->info(), act_info); } diff --git a/src/runtime/cpu/operators/CpuMul.cpp b/src/runtime/cpu/operators/CpuMul.cpp new file mode 100644 index 0000000000..2f3d442a70 --- /dev/null +++ b/src/runtime/cpu/operators/CpuMul.cpp @@ -0,0 +1,77 @@ +/* 
+ * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuMul.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/core/cpu/kernels/CpuMulKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); +} + +void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + auto k = std::make_unique(); + k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy); + _kernel = std::move(k); +} + +void CpuMul::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} + +Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuComplexMulKernel::validate(src1, src2, dst); +} + +void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + auto k = std::make_unique(); + k->configure(src1, src2, dst); + _kernel = std::move(k); +} + +void CpuComplexMul::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuMul.h b/src/runtime/cpu/operators/CpuMul.h new file mode 100644 index 0000000000..6e717188a4 --- /dev/null +++ b/src/runtime/cpu/operators/CpuMul.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_MUL_H +#define ARM_COMPUTE_CPU_MUL_H + +#include "arm_compute/core/ITensorInfo.h" +#include "src/runtime/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuMulKernel */ +class CpuMul : public ICpuOperator +{ +public: + /** Default Constructor */ + CpuMul() = default; + /** Initialise the kernel's inputs, dst and convertion policy. + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32). + * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst dst tensor info. Data types supported: + * - U8, only if both inputs are U8. + * - QASYMM8, only if both inputs are QASYMM8. + * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED. + * - S16. + * - QSYMM16, only if both inputs are QSYMM16. + * - S32, only if both inputs are S32 or both are QSYMM16. + * - F16, only if @p src1 is F16. + * - F32, only if both inputs are F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype + * @param[in] rounding_policy Rounding policy. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
+ */ + void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; + +/** Basic function to run @ref kernels::CpuComplexMulKernel */ +class CpuComplexMul : public ICpuOperator +{ +public: + /** Default Constructor */ + CpuComplexMul() = default; + /** Initialise the kernel's inputs, dst. + * + * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuComplexMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_MUL_H */ \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp b/src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp deleted file mode 100644 index 2e560d7490..0000000000 --- a/src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuPixelWiseMultiplication.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -Status CpuPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy); -} - -void CpuPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique(); - k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); - _kernel = std::move(k); -} - -void CpuPixelWiseMultiplication::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} - -Status CpuComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuComplexPixelWiseMultiplicationKernel::validate(input1, input2, output); -} - -void CpuComplexPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} - -void CpuComplexPixelWiseMultiplication::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuPixelWiseMultiplication.h b/src/runtime/cpu/operators/CpuPixelWiseMultiplication.h deleted file mode 100644 index b2cd7d529b..0000000000 --- a/src/runtime/cpu/operators/CpuPixelWiseMultiplication.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_PIXELWISEMULTIPLICATION_H -#define ARM_COMPUTE_CPU_PIXELWISEMULTIPLICATION_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuPixelWiseMultiplicationKernel */ -class CpuPixelWiseMultiplication : public ICpuOperator -{ -public: - /** Default Constructor */ - CpuPixelWiseMultiplication() = default; - /** Initialise the kernel's inputs, output and convertion policy. - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in, out] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output Output tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are S32 or both are QSYMM16. - * - F16, only if @p input1 is F16. - * - F32, only if both inputs are F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuPixelWiseMultiplication - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. 
- * - * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] input2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). - * @param[in] output Output tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are S32 or both are QSYMM16. - * - F16, only if @p input1 is F16. - * - F32, only if both inputs are F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; - -/** Basic function to run @ref kernels::CpuComplexPixelWiseMultiplicationKernel. */ -class CpuComplexPixelWiseMultiplication : public ICpuOperator -{ -public: - /** Default Constructor */ - CpuComplexPixelWiseMultiplication() = default; - /** Initialise the kernel's inputs, output. - * - * @param[in, out] input1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 Second input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1. - * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuComplexPixelWiseMultiplication - * - * @param[in] input1 First input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * @param[in] input2 Second input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. - * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
Currently not supported. - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_PIXELWISEMULTIPLICATION_H */ \ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClMul.cpp b/src/runtime/gpu/cl/operators/ClMul.cpp new file mode 100644 index 0000000000..d1e2bc806f --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClMul.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/gpu/cl/operators/ClMul.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/kernels/ClMulKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique(); + k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); + _kernel = std::move(k); +} + +Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); +} + +void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique(); + k->configure(compile_context, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); +} +} // namespace opencl +} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClMul.h b/src/runtime/gpu/cl/operators/ClMul.h new file mode 100644 index 0000000000..4a662b3276 --- /dev/null +++ b/src/runtime/gpu/cl/operators/ClMul.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_MUL_H +#define ARM_COMPUTE_CL_MUL_H + +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/runtime/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref opencl::kernels::ClMulKernel */ +class ClMul : public IClOperator +{ +public: + /** Default Constructor */ + ClMul() = default; + /** Initialise the kernel's sources, dst and convertion policy. 
+ * + * Valid configurations (src1,src2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,U8) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * - (QSYMM16,QSYMM16) -> S32 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; + +/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ +class ClComplexMul : public IClOperator +{ +public: + /** Default Constructor */ + ClComplexMul() = default; + /** Initialise the kernel's sources, dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
+ */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClComplexMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_MUL_H */ \ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp deleted file mode 100644 index 137a0de6a7..0000000000 --- a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/core/gpu/cl/kernels/ClPixelWiseMultiplicationKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -void ClPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique(); - k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); - _kernel = std::move(k); -} - -Status ClPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) -{ - return kernels::ClPixelWiseMultiplicationKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); -} - -void ClComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - auto k = std::make_unique(); - k->configure(compile_context, src1, src2, dst, act_info); - _kernel = std::move(k); -} - -Status ClComplexPixelWiseMultiplication::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - return kernels::ClComplexPixelWiseMultiplicationKernel::validate(src1, src2, dst, act_info); -} -} // namespace opencl -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h b/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h deleted file mode 100644 index e1598cb870..0000000000 --- a/src/runtime/gpu/cl/operators/ClPixelWiseMultiplication.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CL_PIXELWISE_MULTIPLICATION_H -#define ARM_COMPUTE_CL_PIXELWISE_MULTIPLICATION_H - -#include "src/core/gpu/cl/ClCompileContext.h" -#include "src/runtime/gpu/cl/IClOperator.h" - -#include - -namespace arm_compute -{ -namespace opencl -{ -/** Basic function to run @ref opencl::ClPixelWiseMultiplication. 
*/ -class ClPixelWiseMultiplication : public IClOperator -{ -public: - /** Default Constructor */ - ClPixelWiseMultiplication() = default; - /** Initialise the kernel's sources, dst and convertion policy. - * - * Valid configurations (src1,src2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,U8) -> S16 - * - (S16,S16) -> S16 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - (QSYMM16,QSYMM16) -> S32 - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClPixelWiseMultiplication - * - * Valid configurations (src1,src2) -> Output : - * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,U8) -> S16 - * - (S16,S16) -> S16 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 - * - (QSYMM16,QSYMM16) -> S32 - * - * - * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate - * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; - -/** Basic function to run @ref opencl::ClComplexPixelWiseMultiplication. 
*/ -class ClComplexPixelWiseMultiplication : public IClOperator -{ -public: - /** Default Constructor */ - ClComplexPixelWiseMultiplication() = default; - /** Initialise the kernel's sources, dst. - * - * @param[in] compile_context The compile context to be used. - * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2. - * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref ClComplexPixelWiseMultiplication - * - * @param[in] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2. - * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace opencl -} // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PIXELWISE_MULTIPLICATION_H */ \ No newline at end of file -- cgit v1.2.1
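
Usage sketch (illustrative, not part of the patch): the public CLPixelWiseMultiplication and NEPixelWiseMultiplication functions keep their names and, as the hunks above show, simply delegate to the renamed internal operators (opencl::ClMul / cpu::CpuMul and their complex counterparts). The snippet below sketches how the new cpu::CpuMul operator could be driven directly, mirroring the configure/validate/run(ITensorPack) pattern visible in the diff. The function name, tensor shape, and data type are invented for illustration; the include paths and ITensorPack/TensorType API are assumed to match the library version this patch targets, and note that CpuMul.h is an internal header normally reached only through NEPixelWiseMultiplication.

    // Illustrative only - not part of the patch.
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuMul.h" // internal operator added by this patch

    using namespace arm_compute;

    void run_mul_example() // hypothetical helper, not from the patch
    {
        // Two F32 inputs and one output of the same shape.
        Tensor src0, src1, dst;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        src0.allocator()->init(info);
        src1.allocator()->init(info);
        dst.allocator()->init(info);

        // Validate on tensor metadata, then configure the operator.
        cpu::CpuMul mul;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuMul::validate(src0.info(), src1.info(), dst.info(),
                                                         1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
        mul.configure(src0.info(), src1.info(), dst.info(),
                      1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        // Allocate backing memory after configuration.
        src0.allocator()->allocate();
        src1.allocator()->allocate();
        dst.allocator()->allocate();

        // Bind concrete tensors at run time through a tensor pack, as the
        // kernels above do with ACL_SRC_0 / ACL_SRC_1 / ACL_DST.
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, &src0);
        pack.add_const_tensor(TensorType::ACL_SRC_1, &src1);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        mul.run(pack);
    }

The equivalent OpenCL path goes through opencl::ClMul, whose configure() additionally takes a CLCompileContext, as shown in the ClMul.cpp hunk above.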