author    Sheri Zhang <sheri.zhang@arm.com>        2021-03-16 17:35:08 +0000
committer Manuel Bottini <manuel.bottini@arm.com>  2021-03-26 16:36:02 +0000
commit    1e3ab4264fb0455abe8a3903abab40c59b9be91e (patch)
tree      b4dd79bd7a93bedf7c6ec274c5f586f7fea0e9aa
parent    e81825bf68ebfce21f6839fa59ddb7e22884a206 (diff)
download  ComputeLibrary-1e3ab4264fb0455abe8a3903abab40c59b9be91e.tar.gz
Make CpuPixelWiseMultiplicationKernel stateless
Resolves: COMPMID-4183

Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: Ie535c4129a6164b879fb5c4acb15f2be58ee8b6c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5325
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
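The change follows the library's stateless-operator pattern: `configure()` works on `ITensorInfo` descriptors only, and the actual tensors are bound at execution time through an `ITensorPack`. A minimal usage sketch of the new `cpu::CpuPixelWiseMultiplication` operator introduced by this patch (the scale/policy arguments and tensor setup are illustrative; the pack ids follow the library's existing stateless operators):

```cpp
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuPixelWiseMultiplication.h"

using namespace arm_compute;

void multiply_example(Tensor &src0, Tensor &src1, Tensor &dst)
{
    cpu::CpuPixelWiseMultiplication mul;
    // Configuration only inspects metadata; the operator keeps no tensor pointers.
    mul.configure(src0.info(), src1.info(), dst.info(),
                  1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    // Tensors are supplied per run, so one configured operator can serve many workloads.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &src0);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &src1);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    mul.run(pack);
}
```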
-rw-r--r--   Android.bp                                                          3
-rw-r--r--   arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h   118
-rw-r--r--   docs/00_introduction.dox                                           10
-rw-r--r--   src/core/NEON/NEKernels.h                                           1
-rw-r--r--   src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp        463
             (renamed from src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp)
-rw-r--r--   src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h          137
             (renamed from src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h)
-rw-r--r--   src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp          68
-rw-r--r--   src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp          77
-rw-r--r--   src/runtime/cpu/operators/CpuPixelWiseMultiplication.h           133
9 files changed, 541 insertions, 469 deletions
diff --git a/Android.bp b/Android.bp
index 8560de3e89..dcf4b802fa 100644
--- a/Android.bp
+++ b/Android.bp
@@ -220,7 +220,6 @@ cc_library_static {
"src/core/NEON/kernels/NEMinMaxLayerKernel.cpp",
"src/core/NEON/kernels/NENormalizationLayerKernel.cpp",
"src/core/NEON/kernels/NEPadLayerKernel.cpp",
- "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp",
"src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
"src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp",
"src/core/NEON/kernels/NEROIAlignLayerKernel.cpp",
@@ -322,6 +321,7 @@ cc_library_static {
"src/core/cpu/kernels/CpuFillKernel.cpp",
"src/core/cpu/kernels/CpuFloorKernel.cpp",
"src/core/cpu/kernels/CpuPermuteKernel.cpp",
+ "src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp",
"src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp",
"src/core/cpu/kernels/CpuPoolingKernel.cpp",
"src/core/cpu/kernels/CpuQuantizationKernel.cpp",
@@ -661,6 +661,7 @@ cc_library_static {
"src/runtime/cpu/operators/CpuFill.cpp",
"src/runtime/cpu/operators/CpuFloor.cpp",
"src/runtime/cpu/operators/CpuPermute.cpp",
+ "src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp",
"src/runtime/cpu/operators/CpuPooling.cpp",
"src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp",
"src/runtime/cpu/operators/CpuQuantization.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 91cf44ff2e..6f4cce3cde 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,113 +26,15 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/INEOperator.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
class ITensorInfo;
-namespace experimental
-{
-/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
-class NEPixelWiseMultiplication : public INEOperator
-{
-public:
- /** Initialise the kernel's inputs, output and convertion policy.
- *
- * Valid configurations (Input1,Input2) -> Output :
- *
- * Support: Broadcast? Scale=1/255?
- * - (U8,U8) -> U8, S16 N Y
- * - (U8,S16) -> S16 N Y
- * - (S16,U8) -> S16 N Y
- * - (S16,S16) -> S16 N Y
- * - (S32,S32) -> S32 Y N
- * - (F16,F16) -> F16 N Y
- * - (F32,F32) -> F32 Y Y
- * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y
- * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y
- *
- * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
- * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- *
- * @param[in, out] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
- * @param[in] rounding_policy Rounding policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
- *
- * Valid configurations (Input1,Input2) -> Output :
- *
- * Support: Broadcast? Scale=1/255?
- * - (U8,U8) -> U8, S16 N Y
- * - (U8,S16) -> S16 N Y
- * - (S16,U8) -> S16 N Y
- * - (S16,S16) -> S16 N Y
- * - (S32,S32) -> S32 Y N
- * - (F16,F16) -> F16 N Y
- * - (F32,F32) -> F32 Y Y
- * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y
- * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y
- *
- * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
- * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- *
- * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
- * @param[in] rounding_policy Rounding policy.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-
-/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
-class NEComplexPixelWiseMultiplication : public INEOperator
-{
-public:
- /** Initialise the kernel's inputs, output.
- *
- * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
- *
- * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace experimental
-
-/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
+/** Basic function to run @ref cpu::CpuPixelWiseMultiplication */
class NEPixelWiseMultiplication : public IFunction
{
public:
@@ -143,11 +45,11 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEPixelWiseMultiplication(const NEPixelWiseMultiplication &) = delete;
/** Default move constructor */
- NEPixelWiseMultiplication(NEPixelWiseMultiplication &&);
+ NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEPixelWiseMultiplication &operator=(const NEPixelWiseMultiplication &) = delete;
/** Default move assignment operator */
- NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&);
+ NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&) = default;
 /** Initialise the kernel's inputs, output and conversion policy.
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
@@ -211,7 +113,7 @@ private:
std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
+/** Basic function to run @ref cpu::CpuComplexPixelWiseMultiplication. */
class NEComplexPixelWiseMultiplication : public IFunction
{
public:
@@ -222,11 +124,11 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEComplexPixelWiseMultiplication(const NEComplexPixelWiseMultiplication &) = delete;
/** Default move constructor */
- NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&);
+ NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEComplexPixelWiseMultiplication &operator=(const NEComplexPixelWiseMultiplication &) = delete;
/** Default move assignment operator */
- NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&);
+ NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&) = default;
 /** Initialise the kernel's inputs and output.
*
* @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -253,5 +155,5 @@ private:
struct Impl;
std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H */
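On the runtime side, `NEPixelWiseMultiplication` keeps its tensor-based public API and becomes a thin wrapper: the `Impl` struct behind `_impl` captures the tensors at `configure()` time and repacks them for the stateless operator on every `run()`. A sketch of that wrapper pattern as used across these ports (member names are illustrative):

```cpp
struct NEPixelWiseMultiplication::Impl
{
    const ITensor                                    *src_0{ nullptr };
    const ITensor                                    *src_1{ nullptr };
    ITensor                                          *dst{ nullptr };
    std::unique_ptr<cpu::CpuPixelWiseMultiplication>  op{ nullptr };
};

void NEPixelWiseMultiplication::run()
{
    // The pack is rebuilt on each call; all per-run state lives here, not in the kernel.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, _impl->src_0);
    pack.add_const_tensor(TensorType::ACL_SRC_1, _impl->src_1);
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack);
}
```

Defaulted move construction and assignment (the `= default` changes above) become safe precisely because all state now sits behind the single `unique_ptr<Impl>`.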
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 913f76cf5b..ea0f9f7d43 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -232,7 +232,7 @@ v20.11 Public major release
- NEArithmeticSubtraction
- NEArithmeticSubtractionKernel
- @ref NEPixelWiseMultiplication
- - @ref NEPixelWiseMultiplicationKernel
+ - NEPixelWiseMultiplicationKernel
- NEElementwiseDivision
- NEDivisionOperationKernel
- Interface change
@@ -255,7 +255,7 @@ v20.11 Public major release
- @ref NELogicalAnd
- @ref NELogicalOr
- Removed padding from Neon kernels:
- - @ref NEComplexPixelWiseMultiplicationKernel
+ - NEComplexPixelWiseMultiplicationKernel
- NENonMaximaSuppression3x3Kernel
- @ref NERemapKernel
- @ref NEGEMMInterleave4x4Kernel
@@ -548,7 +548,7 @@ v20.08 Public major release
- Enabled tuning for export_to_cl_image_rhs option for RHS tensors
- More robust script for running benchmarks
- Removed padding from:
- - @ref NEPixelWiseMultiplicationKernel
+ - NEPixelWiseMultiplicationKernel
- NEHeightConcatenateLayerKernel
- NEThresholdKernel
- NEBatchConcatenateLayerKernel
@@ -861,7 +861,7 @@ v19.05 Public major release
- Various optimisations.
- New Neon kernels / functions:
- @ref NEBatchToSpaceLayerKernel / @ref NEBatchToSpaceLayer
- - @ref NEComplexPixelWiseMultiplicationKernel / @ref NEComplexPixelWiseMultiplication
+ - NEComplexPixelWiseMultiplicationKernel / @ref NEComplexPixelWiseMultiplication
- @ref NECropKernel / @ref NECropResize
- @ref NEDepthwiseConvolutionAssemblyDispatch
- @ref NEFFTDigitReverseKernel
@@ -1128,7 +1128,7 @@ v18.05 Public major release
- Created the validate method in @ref CLDepthwiseConvolutionLayer.
- Beta and gamma are no longer mandatory arguments in @ref NEBatchNormalizationLayer and @ref CLBatchNormalizationLayer.
- Added depth multiplier support in @ref NEDepthwiseConvolutionLayer and @ref CLDepthwiseConvolutionLayer.
- - Added broadcast multiply support in @ref NEPixelWiseMultiplication / @ref NEPixelWiseMultiplicationKernel.
+ - Added broadcast multiply support in @ref NEPixelWiseMultiplication / NEPixelWiseMultiplicationKernel.
- Port mobilenet example to NHWC data layout.
- Enabled Winograd method in @ref CLConvolutionLayer.
- Renamed NEWinogradLayer to @ref NEWinogradConvolutionLayer.
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 53e02261f1..0acaebb582 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -72,7 +72,6 @@
#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
-#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp
index b287e18281..91b7552ecf 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp
@@ -21,8 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
@@ -33,60 +34,60 @@
#include <arm_neon.h>
-#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_fp16.h> // needed for float16_t
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
namespace arm_compute
{
+namespace cpu
+{
+namespace kernels
+{
namespace
{
const float scale255_constant = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
-inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
DataType::S16, DataType::QSYMM16,
DataType::S32, DataType::F16, DataType::F32);
- if(is_data_type_quantized(input1->data_type()) || is_data_type_quantized(input2->data_type()))
+ if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
}
- if(output->total_size() > 0)
+ if(dst->total_size() > 0)
{
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
// clang-format off
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(input1->data_type() == input2->data_type() && input2->data_type() == output->data_type()) &&
- !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
- !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16) &&
- !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
- !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) &&
- !(input1->data_type() == DataType::QSYMM16 && input2->data_type() == DataType::QSYMM16 && output->data_type() == DataType::S32)
+ !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
+ !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
+ !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
+ !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
+ !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
+ !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
, "Invalid data type combination");
// clang-format on
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::S16 && output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
}
if(std::abs(scale - scale255_constant) < 0.00001f)
{
ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::S32 && input2->data_type() == DataType::S32 && output->data_type() == DataType::S32,
- "Scale == 1/255 is not supported if input and output are of data type S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
+ "Scale == 1/255 is not supported if input and dst are of data type S32");
}
else
{
@@ -109,7 +110,7 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
* @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
*
* @param in Input vector to scale.
- * @return Scaled output rounded to nearest (round half up).
+ * @return Scaled dst rounded to nearest (round half up).
*/
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
@@ -143,12 +144,12 @@ vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
}
template <typename T>
-void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
+void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
// Create input windows
Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -156,7 +157,7 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o
const int window_step_x = 16 / sizeof(T);
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
@@ -166,8 +167,8 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
@@ -176,14 +177,14 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o
Iterator broadcast_input(broadcast_tensor, broadcast_win);
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
+ Iterator dst(out, win);
using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
@@ -206,7 +207,7 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o
vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
};
- // Quantize output
+ // Quantize dst
const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
wrapper::vstore(output_ptr + x, result);
}
@@ -215,36 +216,36 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o
for(; x < window_end_x; ++x)
{
// Dequantize inputs
- const T in1 = *(non_broadcast_input_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, non_broadcast_qinfo);
+ const T src1 = *(non_broadcast_input_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
const float tmp_f = tmp_in1 * tmp_in2;
- // Quantize output
+ // Quantize dst
const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
*(output_ptr + x) = tmp_qua;
}
},
- broadcast_input, non_broadcast_input, output);
+ broadcast_input, non_broadcast_input, dst);
}
else
{
- const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
// Clear X Dimension on execution window as we handle manually
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -265,7 +266,7 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o
vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
};
- // Quantize output
+ // Quantize dst
const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
wrapper::vstore(output_ptr + x, result);
}
@@ -274,40 +275,40 @@ void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *o
for(; x < window_end_x; ++x)
{
// Dequantize inputs
- const T in1 = *(input1_ptr + x);
- const T in2 = *(input2_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, input1_qua_info);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(in2, input2_qua_info);
+ const T src1 = *(input1_ptr + x);
+ const T src2 = *(input2_ptr + x);
+ const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
+ const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
const float tmp_f = tmp_in1 * tmp_in2;
- // Quantize output
+ // Quantize dst
const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
*(output_ptr + x) = tmp_qua;
}
},
- input1, input2, output);
+ input1, input2, dst);
}
}
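An aside on the requantization in the quantized path above: folding the operator's `scale` into the output quantization (`tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }`) lets a single quantize step apply both the product and the scale, since dividing by `out_scale / scale` equals multiplying by `scale` before dividing by `out_scale`. A self-contained sketch of the scalar tail logic, with hypothetical stand-ins for the library's quantization helpers:

```cpp
#include <cmath>
#include <cstdint>

// Stand-in for UniformQuantizationInfo (illustrative only).
struct QInfo { float scale; int32_t offset; };

float dequantize(uint8_t q, QInfo qi) { return (static_cast<int32_t>(q) - qi.offset) * qi.scale; }

uint8_t quantize(float f, QInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(f / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(q < 0 ? 0 : (q > 255 ? 255 : q)); // saturate to QASYMM8 range
}

uint8_t mul_one_element(uint8_t a, uint8_t b, QInfo qa, QInfo qb, QInfo qo, float scale)
{
    // Requantizing with the scale folded into the output info:
    // quantize(x, {qo.scale / scale, qo.offset}) == quantize(x * scale, qo),
    // so the extra multiply by `scale` disappears.
    const QInfo tmp{ qo.scale / scale, qo.offset };
    return quantize(dequantize(a, qa) * dequantize(b, qb), tmp);
}
```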
-void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
+void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
- const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
+ const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
// Create input windows
Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
win.set(Window::DimX, Window::Dimension(0, 1, 1));
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -319,7 +320,7 @@ void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2
{
const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -365,32 +366,32 @@ void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2
float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
float tmp_f = tmp_in1 * tmp_in2;
- // Quantize output, lrintf() has same rounding mode as vcombine_s16
+ // Quantize dst, lrintf() has same rounding mode as vcombine_s16
int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
qsymm16_t tmp_qua = static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
*(output_ptr + x) = tmp_qua;
}
},
- input1, input2, output);
+ input1, input2, dst);
}
-void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale)
+void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
ARM_COMPUTE_UNUSED(scale);
// Create input windows
Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
win.set(Window::DimX, Window::Dimension(0, 1, 1));
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -400,7 +401,7 @@ void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *ou
{
const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -463,25 +464,25 @@ void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *ou
*(output_ptr + x) = tmp;
}
},
- input1, input2, output);
+ input1, input2, dst);
}
template <bool is_scale255, bool is_sat>
-void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
// Create input windows
Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
win.set(Window::DimX, Window::Dimension(0, 1, 1));
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
const int window_step_x = 16 / sizeof(uint8_t);
const auto window_start_x = static_cast<int>(window.x().start());
@@ -491,7 +492,7 @@ void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Wi
{
const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -559,16 +560,16 @@ void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Wi
*(output_ptr + x) = static_cast<uint8_t>(tmp);
}
},
- input1, input2, output);
+ input1, input2, dst);
}
template <bool is_scale255, bool is_sat>
-inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
+inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
- int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1));
- const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
- int32x4_t tmp1_low = vmovl_s16(vget_low_s16(input1));
- const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(input2));
+ int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
+ const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
+ int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
+ const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));
tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
@@ -616,15 +617,15 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t
}
template <bool is_scale255, bool is_sat>
-inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
+inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
const int16x8x2_t result =
{
{
// First 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
// Second 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
+ mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
}
};
@@ -632,21 +633,21 @@ inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x
}
template <bool is_scale255, bool is_sat>
-void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
// Create input windows
Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
win.set(Window::DimX, Window::Dimension(0, 1, 1));
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -656,7 +657,7 @@ void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const
{
const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -712,16 +713,16 @@ void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = static_cast<int16_t>(tmp);
}
},
- input1, input2, output);
+ input1, input2, dst);
}
template <bool is_sat>
-inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &input1, const int32x4_t &input2, int n)
+inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
{
- const int32x2_t input1_1 = vget_low_s32(input1);
- const int32x2_t input2_1 = vget_low_s32(input2);
- const int32x2_t input1_2 = vget_high_s32(input1);
- const int32x2_t input2_2 = vget_high_s32(input2);
+ const int32x2_t input1_1 = vget_low_s32(src1);
+ const int32x2_t input2_1 = vget_low_s32(src2);
+ const int32x2_t input1_2 = vget_high_s32(src1);
+ const int32x2_t input2_2 = vget_high_s32(src2);
int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
@@ -756,15 +757,15 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &input1, const int32x4_t
}
template <bool is_sat>
-inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &input1, const int32x4x2_t &input2, int n)
+inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
{
const int32x4x2_t result =
{
{
// First 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(input1.val[0], input2.val[0], n),
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
// Second 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(input1.val[1], input2.val[1], n)
+ mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
}
};
@@ -772,11 +773,11 @@ inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &input1, const int32x4x
}
template <bool is_sat>
-void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
// Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
Window win = window;
@@ -785,27 +786,27 @@ void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const
const int window_step_x = 8;
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
if(is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator broadcast_input(broadcast_tensor, broadcast_win);
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
@@ -855,7 +856,7 @@ void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = static_cast<int32_t>(tmp);
}
},
- broadcast_input, non_broadcast_input, output);
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -863,15 +864,15 @@ void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -918,15 +919,15 @@ void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = static_cast<int32_t>(tmp);
}
},
- input1, input2, output);
+ input1, input2, dst);
}
}
-void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
+void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
// Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
Window win = window;
@@ -935,7 +936,7 @@ void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const
constexpr int window_step_x = 16 / sizeof(float);
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
@@ -944,20 +945,20 @@ void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator broadcast_input(broadcast_tensor, broadcast_win);
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
@@ -979,7 +980,7 @@ void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
}
},
- broadcast_input, non_broadcast_input, output);
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -987,15 +988,15 @@ void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -1016,15 +1017,15 @@ void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = ta1 * ta2 * scale;
}
},
- input1, input2, output);
+ input1, input2, dst);
}
}
-void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
{
// Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
Window win = window;
@@ -1033,7 +1034,7 @@ void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, c
constexpr int window_step_x = 8 / sizeof(float);
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
@@ -1042,20 +1043,20 @@ void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, c
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator broadcast_input(broadcast_tensor, broadcast_win);
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
@@ -1093,7 +1094,7 @@ void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, c
*(output_ptr + 2 * x + 1) = res2;
}
},
- broadcast_input, non_broadcast_input, output);
+ broadcast_input, non_broadcast_input, dst);
}
else
{
@@ -1101,15 +1102,15 @@ void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, c
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -1149,16 +1150,16 @@ void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, c
*(output_ptr + 2 * x + 1) = res2;
}
},
- input1, input2, output);
+ input1, input2, dst);
}
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
+void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
// Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
Window win = window;
@@ -1166,23 +1167,23 @@ void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const
constexpr int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
+ const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
if(is_broadcast_across_x)
{
const bool is_broadcast_input_2 = input2_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+ const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
+ const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator broadcast_input(broadcast_tensor, broadcast_win);
Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
const float16x8x2_t broadcast_value_vec =
{
@@ -1220,20 +1221,20 @@ void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
}
},
- broadcast_input, non_broadcast_input, output);
+ broadcast_input, non_broadcast_input, dst);
}
else
{
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
execute_window_loop(win, [&](const Coordinates &)
{
const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -1271,27 +1272,27 @@ void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = ta1 * ta2 * scale;
}
},
- input1, input2, output);
+ input1, input2, dst);
}
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <bool is_scale255, bool is_sat>
-void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
// Create input windows
Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
win.set(Window::DimX, Window::Dimension(0, 1, 1));
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
const int window_step_x = 16 / sizeof(uint8_t);
const auto window_start_x = static_cast<int>(window.x().start());
@@ -1301,7 +1302,7 @@ void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const W
{
const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -1371,25 +1372,25 @@ void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const W
*(output_ptr + x) = static_cast<int16_t>(tmp);
}
},
- input1, input2, output);
+ input1, input2, dst);
}
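All of the specialised mul_* routines above share one traversal shape: the X dimension of the execution window is collapsed to a single step, the body walks in vector-width strides, and a scalar loop finishes the leftover elements. A minimal stand-alone sketch of that pattern, with plain C++ standing in for the NEON intrinsics (names here are illustrative, not library symbols):

// Illustrative stand-in for the vector-body-plus-scalar-tail loop used by
// mul_U8_U8_S16 and friends. window_step_x mirrors "16 / sizeof(type)".
void mul_pattern(const float *src1, const float *src2, float *dst,
                 int window_start_x, int window_end_x, float scale)
{
    const int window_step_x = 16 / sizeof(float); // 128-bit vector = 4 float lanes
    int x = window_start_x;
    // Vectorized body: in the real kernel each stride is one vld1q/vmulq/vst1q.
    for(; x <= (window_end_x - window_step_x); x += window_step_x)
    {
        for(int lane = 0; lane < window_step_x; ++lane)
        {
            dst[x + lane] = src1[x + lane] * src2[x + lane] * scale;
        }
    }
    // Leftover elements that do not fill a whole vector.
    for(; x < window_end_x; ++x)
    {
        dst[x] = src1[x] * src2[x] * scale;
    }
}

The same two-loop skeleton appears in every data-type variant; only the load/store width and the scaling step change.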
template <bool is_scale255, bool is_sat>
-void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
// Create input windows
Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+ Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
+ Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
// Clear X Dimension on execution window as we handle manually
win.set(Window::DimX, Window::Dimension(0, 1, 1));
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
+ Iterator input1(src1, input1_win);
+ Iterator input2(src2, input2_win);
+ Iterator dst(out, win);
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -1399,7 +1400,7 @@ void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const
{
const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+ const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
// Compute window_step_x elements per iteration
int x = window_start_x;
@@ -1463,33 +1464,28 @@ void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const
*(output_ptr + x) = static_cast<int16_t>(tmp);
}
},
- input1, input2, output);
+ input1, input2, dst);
}
template <bool is_scale255, bool is_sat>
-void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
+void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
// Simply swap the two input buffers
- mul_S16_U8_S16<is_scale255, is_sat>(in2, in1, out, window, n);
+ mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
}
} // namespace
-NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
- : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
-{
-}
-
-void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void CpuPixelWiseMultiplicationKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- // Auto initialize output if not initialized
- set_shape_if_empty(*output, out_shape);
+ // Auto initialize dst if not initialized
+ set_shape_if_empty(*dst, out_shape);
_scale = scale;
_scale_exponent = 0;
@@ -1514,9 +1510,9 @@ void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo
_scale_exponent = std::abs(exponent - 1);
}
- const DataType dt_input1 = input1->data_type();
- const DataType dt_input2 = input2->data_type();
- const DataType dt_output = output->data_type();
+ const DataType dt_input1 = src1->data_type();
+ const DataType dt_input2 = src2->data_type();
+ const DataType dt_output = dst->data_type();
const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
switch(dt_input1)
@@ -1624,99 +1620,110 @@ void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo
// Configure kernel window
Window win = calculate_max_window(out_shape);
- INEKernel::configure(win);
+ ICpuKernel::configure(win);
}
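configure() folds the scale into an integer exponent where possible so the integer paths can replace the float multiply with a right shift; the frexp computation itself sits in the elided part of the hunk. Below is a hedged reconstruction of that check, consistent with the visible `_scale_exponent = std::abs(exponent - 1)` but written as an illustrative helper rather than library code:

#include <cmath>
#include <optional>

// Returns n if scale == 1/2^n with n in [0, 15], i.e. the multiply can be
// lowered to a right shift; std::nullopt selects the float path instead.
std::optional<int> scale_as_right_shift(float scale)
{
    int         exponent   = 0;
    const float normalized = std::frexp(scale, &exponent); // scale == normalized * 2^exponent
    // scale == 1/2^n  <=>  normalized == 0.5f and exponent == 1 - n,
    // so n == std::abs(exponent - 1), matching _scale_exponent above.
    if(normalized == 0.5f && exponent <= 1 && exponent >= -14)
    {
        return 1 - exponent;
    }
    return std::nullopt;
}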
-Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
- RoundingPolicy rounding_policy)
+Status CpuPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
return Status{};
}
-void NEPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto output = tensors.get_tensor(TensorType::ACL_DST);
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
if(_func_quantized != nullptr)
{
- (*_func_quantized)(input1, input2, output, window, _scale);
+ (*_func_quantized)(src1, src2, dst, window, _scale);
}
else if(_func_int != nullptr)
{
- (*_func_int)(input1, input2, output, window, _scale_exponent);
+ (*_func_int)(src1, src2, dst, window, _scale_exponent);
}
else
{
ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
- (*_func_float)(input1, input2, output, window, _scale);
+ (*_func_float)(src1, src2, dst, window, _scale);
}
}
+const char *CpuPixelWiseMultiplicationKernel::name() const
+{
+ return "CpuPixelWiseMultiplicationKernel";
+}
namespace
{
-Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- // Validate in case of configured output
- if(output->total_size() > 0)
+ // Validate in case of configured dst
+ if(dst->total_size() > 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
}
return Status{};
}
} // namespace
-void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CpuComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
- const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- // Auto initialize output if not initialized
- const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
- auto_init_if_empty(*output, out_info);
+ // Auto initialize dst if not initialized
+ const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
+ auto_init_if_empty(*dst, out_info);
// Configure kernel window
Window win = calculate_max_window(out_shape);
- INEKernel::configure(win);
+ ICpuKernel::configure(win);
}
-Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status CpuComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
return Status{};
}
-void NEComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto output = tensors.get_tensor(TensorType::ACL_DST);
+ auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
- c_mul_F32_F32_F32_n(input1, input2, output, window);
+ c_mul_F32_F32_F32_n(src1, src2, dst, window);
+}
+
+const char *CpuComplexPixelWiseMultiplicationKernel::name() const
+{
+ return "CpuComplexPixelWiseMultiplicationKernel";
}
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
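Taken together, the kernel-side changes implement the stateless contract: configure() selects function pointers and scale state from ITensorInfo alone, and the tensors are bound per call through the ITensorPack passed to run_op. A toy sketch of that shape (hypothetical names, not library code):

#include <cassert>
#include <cstddef>

// Toy tensor pack: the real ITensorPack maps TensorType keys to ITensor*.
struct Pack
{
    const float *src0{ nullptr };
    const float *src1{ nullptr };
    float       *dst{ nullptr };
    std::size_t  len{ 0 };
};

class StatelessMulKernel
{
public:
    // Configuration stores policy only -- no tensor pointers.
    void configure(float scale) { _scale = scale; }

    // Tensors are bound at run time, so one configured kernel instance can
    // be reused across tensor sets.
    void run_op(const Pack &pack) const
    {
        assert(pack.src0 != nullptr && pack.src1 != nullptr && pack.dst != nullptr);
        for(std::size_t i = 0; i < pack.len; ++i)
        {
            pack.dst[i] = pack.src0[i] * pack.src1[i] * _scale;
        }
    }

private:
    float _scale{ 1.f };
};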
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h
index d414168b2d..567f08d06e 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,39 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
+#ifndef ARM_COMPUTE_CPU_PIXELWISE_MULTIPLICATION_KERNEL_H
+#define ARM_COMPUTE_CPU_PIXELWISE_MULTIPLICATION_KERNEL_H
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
namespace arm_compute
{
-class ITensor;
-
+namespace cpu
+{
+namespace kernels
+{
/** Interface for the kernel to perform multiplication between two tensors */
-class NEPixelWiseMultiplicationKernel : public INEKernel
+class CpuPixelWiseMultiplicationKernel : public ICpuKernel
{
public:
- const char *name() const override
- {
- return "NEPixelWiseMultiplicationKernel";
- }
/** Default constructor */
- NEPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
- /** Default destructor */
- ~NEPixelWiseMultiplicationKernel() = default;
- /** Initialise the kernel's input, output and border mode.
+ CpuPixelWiseMultiplicationKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPixelWiseMultiplicationKernel);
+ /** Initialise the kernel's src, dst, scale and conversion policies.
*
- * Valid configurations (Input1,Input2) -> Output :
+ * Valid configurations (Src1,Src2) -> Dst :
*
* Support: Broadcast? Scale=1/255?
* - (U8,U8) -> U8, S16 N Y
@@ -70,19 +59,19 @@ public:
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
+ * If @p src1, @p src2 and @p dst are all of datatype S32, scale cannot be 1/255
* @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
* @param[in] rounding_policy Rounding policy.
*/
- void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
- /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
+ void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuPixelWiseMultiplicationKernel
*
- * Valid configurations (Input1,Input2) -> Output :
+ * Valid configurations (Src1,Src2) -> Dst :
* Support: Broadcast? Scale=1/255?
* - (U8,U8) -> U8, S16 N Y
* - (U8,S16) -> S16 N Y
@@ -98,89 +87,89 @@ public:
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] src1 First src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] src2 Second src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] dst Dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+ * If @p src1, @p src2 and @p dst are all of datatype S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the srcs is of quantized datatype
* @param[in] rounding_policy Rounding policy.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
// Inherited methods overridden
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
private:
/** Common signature for all the specialised multiplication functions with integer scaling factor
*
- * @param[in] in1 Input1 tensor object.
- * @param[in] in2 Input2 tensor object.
- * @param[out] out Output tensor object.
+ * @param[in] src1 Src1 tensor object.
+ * @param[in] src2 Src2 tensor object.
+ * @param[out] dst Dst tensor object.
* @param[in] window Region on which to execute the kernel
* @param[in] scale Integer scale factor.
*/
- using MulFunctionInt = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale);
+ using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
/** Common signature for all the specialised multiplication functions with float scaling factor
*
- * @param[in] in1 Input1 tensor object.
- * @param[in] in2 Input2 tensor object.
- * @param[out] out Output tensor object.
+ * @param[in] src1 Src1 tensor object.
+ * @param[in] src2 Src2 tensor object.
+ * @param[out] dst Dst tensor object.
* @param[in] window Region on which to execute the kernel
* @param[in] scale Float scale factor.
*/
- using MulFunctionFloat = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale);
+ using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
/** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
*
- * @param[in] in1 Input1 tensor object.
- * @param[in] in2 Input2 tensor object.
- * @param[out] out Output tensor object.
+ * @param[in] src1 Src1 tensor object.
+ * @param[in] src2 Src2 tensor object.
+ * @param[out] dst Dst tensor object.
* @param[in] window Region on which to execute the kernel
* @param[in] scale Float scale factor.
*
*/
- using MulFunctionQuantized = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale);
-
- MulFunctionFloat *_func_float;
- MulFunctionInt *_func_int;
- MulFunctionQuantized *_func_quantized;
+ using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
-private:
- float _scale;
- int _scale_exponent;
+ MulFunctionFloat *_func_float{ nullptr };
+ MulFunctionInt *_func_int{ nullptr };
+ MulFunctionQuantized *_func_quantized{ nullptr };
+ float _scale{ 0 };
+ int _scale_exponent{ 0 };
};
/** Interface for the complex pixelwise multiplication kernel. */
-class NEComplexPixelWiseMultiplicationKernel : public INEKernel
+class CpuComplexPixelWiseMultiplicationKernel : public ICpuKernel
{
public:
- const char *name() const override
- {
- return "NEComplexPixelWiseMultiplicationKernel";
- }
- /** Initialise the kernel's input, output and border mode.
+ /** Default constructor */
+ CpuComplexPixelWiseMultiplicationKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexPixelWiseMultiplicationKernel);
+ /** Initialise the kernel's src and dst.
*
- * @param[in] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
*/
- void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel
+ void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuComplexPixelWiseMultiplicationKernel
*
- * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * @param[in] src1 An src tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[in] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
};
-
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H */
+#endif /*ARM_COMPUTE_CPU_PIXELWISE_MULTIPLICATION_KERNEL_H */
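The header replaces five hand-written special members with ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE from src/core/common/Macros.h and moves the member initialisation into the class body, which lets the constructor default. The expansion below is an assumption about what the macro does, shown for orientation rather than copied from Macros.h:

// Assumed expansion of the macro: it reproduces, in one line per class,
// the deleted copy members and the defaulted move members above.
#define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName)  \
    TypeName(const TypeName &) = delete;                \
    TypeName &operator=(const TypeName &) = delete;     \
    TypeName(TypeName &&) = default;                    \
    TypeName &operator=(TypeName &&) = default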
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 179bcdaf3e..4d7fef89ed 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,64 +24,30 @@
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/core/ITensor.h"
-#include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
+#include "src/runtime/cpu/operators/CpuPixelWiseMultiplication.h"
#include <utility>
namespace arm_compute
{
-namespace experimental
-{
-void NEPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<NEPixelWiseMultiplicationKernel>();
- k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
- _kernel = std::move(k);
-}
-Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
- const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
-}
-
-void NEComplexPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<NEComplexPixelWiseMultiplicationKernel>();
- k->configure(input1, input2, output);
- _kernel = std::move(k);
-}
-
-Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
-}
-} // namespace experimental
-
struct NEPixelWiseMultiplication::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NEPixelWiseMultiplication> op{ nullptr };
+ const ITensor *src_0{ nullptr };
+ const ITensor *src_1{ nullptr };
+ ITensor *dst{ nullptr };
+ std::unique_ptr<cpu::CpuPixelWiseMultiplication> op{ nullptr };
};
NEPixelWiseMultiplication::NEPixelWiseMultiplication()
: _impl(std::make_unique<Impl>())
{
}
-NEPixelWiseMultiplication::NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default;
-NEPixelWiseMultiplication &NEPixelWiseMultiplication::operator=(NEPixelWiseMultiplication &&) = default;
-NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default;
+NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default;
Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
- return experimental::NEPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+ return cpu::CpuPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
@@ -90,7 +56,7 @@ void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
- _impl->op = std::make_unique<experimental::NEPixelWiseMultiplication>();
+ _impl->op = std::make_unique<cpu::CpuPixelWiseMultiplication>();
_impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
}
@@ -105,23 +71,21 @@ void NEPixelWiseMultiplication::run()
struct NEComplexPixelWiseMultiplication::Impl
{
- ITensor *src_0{ nullptr };
- ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NEComplexPixelWiseMultiplication> op{ nullptr };
+ ITensor *src_0{ nullptr };
+ ITensor *src_1{ nullptr };
+ ITensor *dst{ nullptr };
+ std::unique_ptr<cpu::CpuComplexPixelWiseMultiplication> op{ nullptr };
};
NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
: _impl(std::make_unique<Impl>())
{
}
-NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default;
-NEComplexPixelWiseMultiplication &NEComplexPixelWiseMultiplication::operator=(NEComplexPixelWiseMultiplication &&) = default;
-NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default;
+NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default;
Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
- return experimental::NEComplexPixelWiseMultiplication::validate(input1, input2, output, act_info);
+ return cpu::CpuComplexPixelWiseMultiplication::validate(input1, input2, output, act_info);
}
void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
@@ -129,7 +93,7 @@ void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
- _impl->op = std::make_unique<experimental::NEComplexPixelWiseMultiplication>();
+ _impl->op = std::make_unique<cpu::CpuComplexPixelWiseMultiplication>();
_impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
}
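The function-level diff keeps the NEPixelWiseMultiplication facade and its Impl struct; the run() override lives in the elided region of the hunk. Judging from the ACL_SRC_0/ACL_SRC_1/ACL_DST keys the kernel reads back in run_op, a plausible reconstruction of the bridge, written as a free function because the committed body is not shown:

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/experimental/Types.h"
#include "src/runtime/cpu/operators/CpuPixelWiseMultiplication.h"

// Plausible shape of NEPixelWiseMultiplication::run(): pack the stored
// tensors under the keys run_op reads back, then forward to the stateless
// operator. A reconstruction, not the committed code.
void run_pixelwise_mul(const arm_compute::ITensor *src_0,
                       const arm_compute::ITensor *src_1,
                       arm_compute::ITensor       *dst,
                       arm_compute::cpu::CpuPixelWiseMultiplication &op)
{
    arm_compute::ITensorPack pack;
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_0, src_0);
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, src_1);
    pack.add_tensor(arm_compute::TensorType::ACL_DST, dst);
    op.run(pack);
}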
diff --git a/src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp b/src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp
new file mode 100644
index 0000000000..2e560d7490
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuPixelWiseMultiplication.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
+}
+
+void CpuPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ auto k = std::make_unique<kernels::CpuPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+}
+
+void CpuPixelWiseMultiplication::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+
+Status CpuComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
+}
+
+void CpuComplexPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ auto k = std::make_unique<kernels::CpuComplexPixelWiseMultiplicationKernel>();
+ k->configure(input1, input2, output);
+ _kernel = std::move(k);
+}
+
+void CpuComplexPixelWiseMultiplication::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
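At this layer the operator is configured against ITensorInfo only and the kernel is scheduled over Window::DimY per run. A hedged usage sketch (the tensor setup is illustrative; src1, src2 and dst stand for already-allocated ITensor objects):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/cpu/operators/CpuPixelWiseMultiplication.h"

using namespace arm_compute;

// Sketch: configure once with shapes/types, bind real tensors per run.
void example(ITensor *src1, ITensor *src2, ITensor *dst)
{
    TensorInfo src1_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo src2_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U, 16U), 1, DataType::F32);

    cpu::CpuPixelWiseMultiplication mul;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuPixelWiseMultiplication::validate(
        &src1_info, &src2_info, &dst_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    mul.configure(&src1_info, &src2_info, &dst_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src1);
    pack.add_const_tensor(TensorType::ACL_SRC_1, src2);
    pack.add_tensor(TensorType::ACL_DST, dst);
    mul.run(pack); // schedules the kernel over Window::DimY via NEScheduler
}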
diff --git a/src/runtime/cpu/operators/CpuPixelWiseMultiplication.h b/src/runtime/cpu/operators/CpuPixelWiseMultiplication.h
new file mode 100644
index 0000000000..b2cd7d529b
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuPixelWiseMultiplication.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_PIXELWISEMULTIPLICATION_H
+#define ARM_COMPUTE_CPU_PIXELWISEMULTIPLICATION_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuPixelWiseMultiplicationKernel */
+class CpuPixelWiseMultiplication : public ICpuOperator
+{
+public:
+ /** Default Constructor */
+ CpuPixelWiseMultiplication() = default;
+ /** Initialise the kernel's inputs, output and conversion policy.
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in, out] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output Output tensor info. Data types supported:
+ * - U8, only if both inputs are U8.
+ * - QASYMM8, only if both inputs are QASYMM8.
+ * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
+ * - S16.
+ * - QSYMM16, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
+ * - F16, only if @p input1 is F16.
+ * - F32, only if both inputs are F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * If @p input1, @p input2 and @p output are all of datatype S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+ * @param[in] rounding_policy Rounding policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuPixelWiseMultiplication
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] input2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in] output Output tensor info. Data types supported:
+ * - U8, only if both inputs are U8.
+ * - QASYMM8, only if both inputs are QASYMM8.
+ * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
+ * - S16.
+ * - QSYMM16, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
+ * - F16, only if @p input1 is F16.
+ * - F32, only if both inputs are F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * If @p input1, @p input2 and @p output are all of datatype S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+ * @param[in] rounding_policy Rounding policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+
+/** Basic function to run @ref kernels::CpuComplexPixelWiseMultiplicationKernel. */
+class CpuComplexPixelWiseMultiplication : public ICpuOperator
+{
+public:
+ /** Default Constructor */
+ CpuComplexPixelWiseMultiplication() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in, out] input1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 Second input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuComplexPixelWiseMultiplication
+ *
+ * @param[in] input1 First input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * @param[in] input2 Second input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_PIXELWISEMULTIPLICATION_H */
\ No newline at end of file
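For users of the public NEON function nothing changes; a minimal caller, with illustrative shapes and values, still looks like:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src1, src2, dst;
    const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
    src1.allocator()->init(info);
    src2.allocator()->init(info);
    dst.allocator()->init(info);

    NEPixelWiseMultiplication mul;
    mul.configure(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    src1.allocator()->allocate();
    src2.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src1/src2 ...

    mul.run(); // internally packs the tensors and dispatches cpu::CpuPixelWiseMultiplication
    return 0;
}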