Diffstat (limited to 'src/cpu/operators')
-rw-r--r--  src/cpu/operators/CpuActivation.cpp | 72
-rw-r--r--  src/cpu/operators/CpuActivation.h | 54
-rw-r--r--  src/cpu/operators/CpuAdd.cpp | 46
-rw-r--r--  src/cpu/operators/CpuAdd.h | 68
-rw-r--r--  src/cpu/operators/CpuCast.cpp | 44
-rw-r--r--  src/cpu/operators/CpuCast.h | 71
-rw-r--r--  src/cpu/operators/CpuConcatenate.cpp | 168
-rw-r--r--  src/cpu/operators/CpuConcatenate.h | 76
-rw-r--r--  src/cpu/operators/CpuConv2d.cpp | 253
-rw-r--r--  src/cpu/operators/CpuConv2d.h | 146
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.cpp | 50
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.h | 57
-rw-r--r--  src/cpu/operators/CpuCopy.cpp | 44
-rw-r--r--  src/cpu/operators/CpuCopy.h | 53
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.cpp | 498
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.h | 209
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp | 135
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h | 80
-rw-r--r--  src/cpu/operators/CpuDequantize.cpp | 54
-rw-r--r--  src/cpu/operators/CpuDequantize.h | 56
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.cpp | 147
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.h | 105
-rw-r--r--  src/cpu/operators/CpuElementwise.cpp | 124
-rw-r--r--  src/cpu/operators/CpuElementwise.h | 185
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.cpp | 58
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.h | 59
-rw-r--r--  src/cpu/operators/CpuFill.cpp | 39
-rw-r--r--  src/cpu/operators/CpuFill.h | 46
-rw-r--r--  src/cpu/operators/CpuFlatten.cpp | 44
-rw-r--r--  src/cpu/operators/CpuFlatten.h | 64
-rw-r--r--  src/cpu/operators/CpuFloor.cpp | 44
-rw-r--r--  src/cpu/operators/CpuFloor.h | 53
-rw-r--r--  src/cpu/operators/CpuFullyConnected.cpp | 496
-rw-r--r--  src/cpu/operators/CpuFullyConnected.h | 147
-rw-r--r--  src/cpu/operators/CpuGemm.cpp | 367
-rw-r--r--  src/cpu/operators/CpuGemm.h | 145
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.cpp | 612
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.h | 203
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.cpp | 222
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.h | 106
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp | 711
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h | 174
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.cpp | 147
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.h | 86
-rw-r--r--  src/cpu/operators/CpuMul.cpp | 77
-rw-r--r--  src/cpu/operators/CpuMul.h | 105
-rw-r--r--  src/cpu/operators/CpuPRelu.h | 38
-rw-r--r--  src/cpu/operators/CpuPermute.cpp | 44
-rw-r--r--  src/cpu/operators/CpuPermute.h | 56
-rw-r--r--  src/cpu/operators/CpuPool2d.cpp | 158
-rw-r--r--  src/cpu/operators/CpuPool2d.h | 85
-rw-r--r--  src/cpu/operators/CpuQuantize.cpp | 58
-rw-r--r--  src/cpu/operators/CpuQuantize.h | 56
-rw-r--r--  src/cpu/operators/CpuReshape.cpp | 44
-rw-r--r--  src/cpu/operators/CpuReshape.h | 53
-rw-r--r--  src/cpu/operators/CpuScale.cpp | 250
-rw-r--r--  src/cpu/operators/CpuScale.h | 69
-rw-r--r--  src/cpu/operators/CpuSoftmax.cpp | 221
-rw-r--r--  src/cpu/operators/CpuSoftmax.h | 111
-rw-r--r--  src/cpu/operators/CpuSub.cpp | 46
-rw-r--r--  src/cpu/operators/CpuSub.h | 66
-rw-r--r--  src/cpu/operators/CpuTranspose.cpp | 44
-rw-r--r--  src/cpu/operators/CpuTranspose.h | 53
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.cpp | 839
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.h | 136
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 721
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.h | 123
67 files changed, 10071 insertions, 0 deletions
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
new file mode 100644
index 0000000000..d9330a8156
--- /dev/null
+++ b/src/cpu/operators/CpuActivation.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuActivation.h"
+
+#include "src/common/IOperator.h"
+#include "src/common/utils/LegacySupport.h"
+#include "src/cpu/CpuContext.h"
+#include "src/cpu/kernels/CpuActivationKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
+{
+ auto k = std::make_unique<kernels::CpuActivationKernel>();
+ k->configure(input, output, activation_info);
+ _kernel = std::move(k);
+}
+
+Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+{
+ return kernels::CpuActivationKernel::validate(input, output, activation_info);
+}
+
+std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+{
+ TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
+ TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
+ auto info = detail::convert_to_activation_info(act);
+
+ if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+ {
+ return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
+ }
+
+ auto act_op = std::make_unique<cpu::CpuActivation>();
+ act_op->configure(&src_info, &dst_info, info);
+
+ auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
+ if(op == nullptr)
+ {
+ ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
+ return std::make_tuple(nullptr, StatusCode::OutOfMemory);
+ }
+ op->set_internal_operator(std::move(act_op));
+
+ return std::make_tuple(op, StatusCode::Success);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h
new file mode 100644
index 0000000000..9b97c9d24f
--- /dev/null
+++ b/src/cpu/operators/CpuActivation.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ACTIVATION_H
+#define ARM_COMPUTE_CPU_ACTIVATION_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuActivationKernel */
+class CpuActivation : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+ * @param[out] output Destination tensor info. Data type supported: same as @p input
+ * @param[in] activation_info Activation layer parameters.
+ */
+ void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuActivation::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */
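The header above shows the configure/validate pair shared by every operator in this directory. The following is a minimal usage sketch, not part of the patch, and only buildable inside the library tree since src/ headers are not public API; the shapes and activation function are arbitrary, and the run step relies on the ITensorPack/TensorType plumbing inherited from ICpuOperator.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuActivation.h"

    using namespace arm_compute;

    void relu_example()
    {
        TensorInfo src_info(TensorShape(32U, 32U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(32U, 32U), 1, DataType::F32);
        const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

        // Stage 1: check the configuration on tensor infos only.
        if(!bool(cpu::CpuActivation::validate(&src_info, &dst_info, act)))
        {
            return; // unsupported configuration
        }

        // Stage 2: configure the stateless operator, still without any backing memory.
        cpu::CpuActivation act_op;
        act_op.configure(&src_info, &dst_info, act);

        // Stage 3: bind real tensors at run time through an ITensorPack.
        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        act_op.run(pack);
    }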
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
new file mode 100644
index 0000000000..42a7b99ceb
--- /dev/null
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuAdd.h"
+
+#include "src/cpu/kernels/CpuAddKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ auto k = std::make_unique<kernels::CpuAddKernel>();
+ k->configure(src0, src1, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h
new file mode 100644
index 0000000000..d8ec620aeb
--- /dev/null
+++ b/src/cpu/operators/CpuAdd.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ADD_H
+#define ARM_COMPUTE_CPU_ADD_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuAddKernel */
+class CpuAdd : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs and dst.
+ *
+ * Valid configurations (src0,src1) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+ * @param[in] policy Overflow policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ *
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuAdd::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_ADD_H */
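As the validate() body in CpuAdd.cpp above shows, a fused activation is not yet supported, so passing an enabled ActivationLayerInfo makes validation fail. A small sketch of that check follows (illustrative only, not part of the patch; the F32 shapes are arbitrary).

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuAdd.h"

    using namespace arm_compute;

    bool addition_configs_are_valid()
    {
        const TensorInfo src0(TensorShape(16U, 16U), 1, DataType::F32);
        const TensorInfo src1(TensorShape(16U, 16U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(16U, 16U), 1, DataType::F32);

        // Accepted: matching F32 operands, no fused activation requested.
        const Status ok = cpu::CpuAdd::validate(&src0, &src1, &dst, ConvertPolicy::SATURATE);

        // Rejected: act_info.enabled() is true, which validate() explicitly forbids.
        const ActivationLayerInfo relu(ActivationLayerInfo::ActivationFunction::RELU);
        const Status rejected = cpu::CpuAdd::validate(&src0, &src1, &dst, ConvertPolicy::SATURATE, relu);

        return bool(ok) && !bool(rejected);
    }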
diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp
new file mode 100644
index 0000000000..d0980c75b6
--- /dev/null
+++ b/src/cpu/operators/CpuCast.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuCast.h"
+
+#include "src/cpu/kernels/CpuCastKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+{
+ auto k = std::make_unique<kernels::CpuCastKernel>();
+ k->configure(src, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ return kernels::CpuCastKernel::validate(src, dst, policy);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuCast.h b/src/cpu/operators/CpuCast.h
new file mode 100644
index 0000000000..5e5f3e0ca6
--- /dev/null
+++ b/src/cpu/operators/CpuCast.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CAST_H
+#define ARM_COMPUTE_CPU_CAST_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuCastKernel */
+class CpuCast : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * The input data type must be different from the output data type.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:-----------------------------------------------|
+ * |QASYMM8_SIGNED | S16, S32, F32, F16 |
+ * |QASYMM8 | U16, S16, S32, F32, F16 |
+ * |U8 | U16, S16, S32, F32, F16 |
+ * |U16 | U8, U32 |
+ * |S16 | QASYMM8_SIGNED, U8, S32 |
+ * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 |
+ * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 |
+ * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8|
+ *
+ * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] policy Conversion policy.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuCast::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CAST_H */
diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp
new file mode 100644
index 0000000000..92c1ef6bdf
--- /dev/null
+++ b/src/cpu/operators/CpuConcatenate.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuConcatenate.h"
+
+#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
+#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
+#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
+#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+
+ _axis = axis;
+ _num_srcs = srcs_vector.size();
+
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis));
+
+ unsigned int offset = 0;
+
+ for(unsigned int i = 0; i < _num_srcs; ++i)
+ {
+ switch(axis)
+ {
+ case Window::DimX:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimY:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case Window::DimZ:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case 3:
+ {
+ auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>();
+ kernel->configure(srcs_vector.at(i), offset, dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+ offset += srcs_vector.at(i)->dimension(axis);
+ }
+}
+
+Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
+
+ unsigned int offset = 0;
+ for(const auto &src : srcs_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ switch(axis)
+ {
+ case Window::DimX:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst));
+ break;
+ }
+ case Window::DimY:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst));
+ break;
+ }
+ case Window::DimZ:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst));
+ break;
+ }
+ case 3:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+ offset += src->dimension(axis);
+ }
+
+ if(dst->total_size() != 0)
+ {
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
+ }
+
+ return Status{};
+}
+
+void CpuConcatenate::run(ITensorPack &tensors)
+{
+ if(tensors.empty())
+ {
+ ARM_COMPUTE_ERROR("No inputs provided");
+ }
+
+ if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
+ {
+ ARM_COMPUTE_ERROR("Configured with different number of inputs");
+ }
+
+ int i = 0;
+ for(auto &k : _concat_kernels)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+ pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+ NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack);
+ ++i;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
new file mode 100644
index 0000000000..001ac68162
--- /dev/null
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONCATENATE_H
+#define ARM_COMPUTE_CPU_CONCATENATE_H
+
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuConcatenateWidthKernel (if underlying concatenation axis is 0).
+ * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1).
+ * -# @ref kernels::CpuConcatenateDepthKernel (if underlying concatenation axis is 2).
+ * -# @ref kernels::CpuConcatenateBatchKernel (if underlying concatenation axis is 3).
+ */
+class CpuConcatenate : public ICpuOperator
+{
+public:
+ CpuConcatenate() = default;
+ /** Configure operator for a given list of arguments
+ *
+ * @note Preconditions on the input and output tensor dimensions differ depending on the concatenation axis.
+ * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel,
+ * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel.
+ *
+ * @param[in,out] srcs_vector The vector containing all the tensor infos to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Output tensor. Data types supported: Same as @p srcs_vector.
+ * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+ */
+ void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConcatenate::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::vector<std::unique_ptr<ICpuKernel>> _concat_kernels{};
+ unsigned int _num_srcs{ 0 };
+ unsigned int _axis{ 0 };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */
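The run() implementation in CpuConcatenate.cpp above fetches the i-th source at slot ACL_SRC_VEC + i and the destination at ACL_DST, so callers have to lay out the ITensorPack the same way. A minimal packing sketch (illustrative, not part of the patch):

    #include <cstddef>
    #include <vector>

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"

    using namespace arm_compute;

    ITensorPack make_concat_pack(const std::vector<const ITensor *> &srcs, ITensor *dst)
    {
        ITensorPack pack;
        for(size_t i = 0; i < srcs.size(); ++i)
        {
            // Source i goes into slot ACL_SRC_VEC + i, matching the lookup in CpuConcatenate::run().
            pack.add_const_tensor(TensorType::ACL_SRC_VEC + static_cast<int>(i), srcs[i]);
        }
        pack.add_tensor(TensorType::ACL_DST, dst);
        return pack;
    }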
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
new file mode 100644
index 0000000000..3878e0de58
--- /dev/null
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuConv2d.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "src/cpu/operators/CpuDirectConv2d.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuConv2d::CpuConv2d()
+ : _function()
+{
+}
+
+CpuConv2d::~CpuConv2d() = default;
+
+void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_UNUSED(num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups));
+
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+ switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ {
+ auto f = std::make_unique<CpuWinogradConv2d>();
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
+ _function = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::GEMM:
+ {
+ auto f = std::make_unique<CpuGemmConv2d>();
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
+ _function = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::GEMM_CONV2D:
+ {
+ auto f = std::make_unique<CpuGemmDirectConv2d>();
+ f->configure(input, weights, biases, output, info);
+ _function = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::DIRECT:
+ {
+ auto f = std::make_unique<CpuDirectConv2d>();
+ f->configure(input, weights, biases, output, conv_info, act_info);
+ _function = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ _aux_mem = _function->workspace();
+}
+
+Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
+
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
+ switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+ break;
+ case ConvolutionMethod::GEMM:
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math));
+ break;
+ case ConvolutionMethod::GEMM_CONV2D:
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
+ break;
+ case ConvolutionMethod::DIRECT:
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDirectConv2d::validate(input, weights, biases, output, conv_info, act_info));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ return Status{};
+}
+
+ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
+ ARM_COMPUTE_UNUSED(weights_info);
+
+ const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
+
+ /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+ using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
+ using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+ const std::vector<ConfigurationMethod> known_configs =
+ {
+ // Alexnet
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
+ // VGG16 / VGG19
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
+ };
+
+ const auto find_config = [&](ConfigurationMethod c)
+ {
+ const ConvolutionConfiguration config = c.first;
+ const PadStrideInfo info = std::get<3>(config);
+
+ return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+ && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+ && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ };
+
+ std::vector<ConfigurationMethod>::const_iterator found;
+ if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ {
+ return (*found).second;
+ }
+
+ if(dilation != Size2D(1U, 1U))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ // SRGAN
+ // Output might not be initialized when it is an internal tensor of the layer using the convolution
+ if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
+ && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
+ {
+ return ConvolutionMethod::FFT;
+ }
+ if(input->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ // This heuristic only applies to the F16 data type on A55r1
+ if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
+ {
+ // Exclude known bad Winograd configs (and default to GEMM)
+ const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
+ {
+ // Squeezenet_V1_1 fire2 and fire3
+ ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ // Squeezenet_V1_1 fire6 and fire7
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ // Squeezenet_V1_1 fire8 and fire9
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ };
+ const auto find_conv_config = [&](ConvolutionConfiguration c)
+ {
+ const PadStrideInfo info = std::get<3>(c);
+
+ return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+ && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+ && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ };
+
+ bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
+ find_conv_config)
+ != known_bad_winograd_f16_with_fastmath_configs.end();
+ if(found_bad)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ // For 1x1 convolutions run the default GEMM
+ if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+
+ if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+ {
+ return ConvolutionMethod::WINOGRAD;
+ }
+ if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+ {
+ return ConvolutionMethod::GEMM_CONV2D;
+ }
+ return ConvolutionMethod::GEMM;
+ }
+}
+
+void CpuConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+ _function->run(tensors);
+}
+
+void CpuConv2d::prepare(ITensorPack &tensors)
+{
+ _function->prepare(tensors);
+}
+
+experimental::MemoryRequirements CpuConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h
new file mode 100644
index 0000000000..daf068f361
--- /dev/null
+++ b/src/cpu/operators/CpuConv2d.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to simulate a convolution layer. This function calls one of the following functions:
+ * -# @ref CpuGemm (executed only in case GEMM is required for the operation)
+ * -# @ref CpuWinogradConv2d (executed only in case Winograd is required for the operation)
+ * -# @ref CpuDirectConv2d (executed only in case Direct Convolution is required for the operation)
+ *
+ *
+ * The function selects one of the algorithms mentioned above based on:
+ * - The size of the kernel
+ * - Number of input/output feature maps
+ * - Amount of memory needed
+ *
+ * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
+ *
+ * FP32 Algorithm| Filter Size | Input/Output feature maps |
+ * --------------|----------------------------------------------------|-------------------------------------------|
+ * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 |
+ * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps |
+ * DirectConv | 9x9 | |
+ * GEMM | Any size | |
+ *
+ * Winograd 5x5 requires fast maths enabled.
+ *
+ * FP16 Algorithm| Filter Size |
+ * --------------|------------------|
+ * Winograd | Not supported |
+ * FFT | Not supported |
+ * DirectConv | 9x9 |
+ * GEMM | Any size |
+ *
+ *
+ */
+class CpuConv2d : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuConv2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConv2d);
+ /** Default destructor */
+ ~CpuConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Same as @p src, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p src.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ */
+ void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
+ *
+ * Similar to CpuConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
+ unsigned int num_groups = 1);
+ /** Static function to check if given info will return the convolution called by @ref CpuConv2d
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported:Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p src.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ *
+ * @return the Convolution Method Hint
+ */
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<ICpuOperator> _function;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
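The method-selection heuristic in CpuConv2d.cpp is exposed through the static get_convolution_method() declared above, so a caller can ask which backend would be dispatched before committing to a configuration. A small sketch follows (not part of the patch; the NCHW shapes are arbitrary and chosen to hit the VGG16 entry of the known_configs table).

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuConv2d.h"

    using namespace arm_compute;

    ConvolutionMethod pick_conv_method()
    {
        // NCHW shapes: src is [W, H, IFM, batches], weights is [kernel_x, kernel_y, IFM, OFM].
        const TensorInfo    src(TensorShape(224U, 224U, 3U, 1U), 1, DataType::F32);
        const TensorInfo    weights(TensorShape(3U, 3U, 3U, 64U), 1, DataType::F32);
        const TensorInfo    dst(TensorShape(224U, 224U, 64U, 1U), 1, DataType::F32);
        const PadStrideInfo conv_info(1, 1, 1, 1);

        // These dimensions match the VGG16-style entry in known_configs, so the heuristic
        // returns ConvolutionMethod::GEMM without probing the Winograd/GEMM_CONV2D validators.
        return cpu::CpuConv2d::get_convolution_method(&src, &weights, &dst, conv_info);
    }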
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000000..da744fc100
--- /dev/null
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+{
+ auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
+ k->configure(src, dst, original_src_shape, data_layout);
+ _kernel = std::move(k);
+}
+
+Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+{
+ return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
+}
+
+void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
+{
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
new file mode 100644
index 0000000000..ea70eee134
--- /dev/null
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
+#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */
+class CpuConvertFullyConnectedWeights : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor to permute. Data types supported: All
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
+ * @param[in] data_layout The data layout the weights have been trained in.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuConvertFullyConnectedWeights::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H */
diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp
new file mode 100644
index 0000000000..2eecc2390e
--- /dev/null
+++ b/src/cpu/operators/CpuCopy.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuCopy.h"
+
+#include "src/cpu/kernels/CpuCopyKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuCopyKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuCopyKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuCopy.h b/src/cpu/operators/CpuCopy.h
new file mode 100644
index 0000000000..9ffde4e781
--- /dev/null
+++ b/src/cpu/operators/CpuCopy.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_COPY_H
+#define ARM_COMPUTE_CPU_COPY_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuCopyKernel */
+class CpuCopy : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data type supported: All
+ * @param[out] dst Destination info. Data type supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuCopy::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_COPY_H */
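
Editor's note: a minimal usage sketch for the operator above, assuming a caller drives it directly through the experimental operator interface. The shapes and the ACL_SRC/ACL_DST pack slots are illustrative assumptions based on the kernel conventions used throughout this patch, not part of the patch itself.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuCopy.h"

    using namespace arm_compute;

    void run_cpu_copy_example()
    {
        // Hypothetical 2D F32 tensors; CpuCopy accepts any data type.
        TensorInfo src_info(TensorShape(32U, 16U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(32U, 16U), 1, DataType::F32);

        cpu::CpuCopy copy;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuCopy::validate(&src_info, &dst_info));
        copy.configure(&src_info, &dst_info);

        // The operator holds no memory of its own: backing tensors are passed at run time.
        Tensor src;
        Tensor dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        copy.run(pack);
    }
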
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..071690e7a6
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuDepthwiseConv2d.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/InfoHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ if(!is_data_type_quantized_per_channel(weights->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
+ const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+ info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+ info.pad_stride_info.pad_bottom());
+
+ if(biases != nullptr)
+ {
+ const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
+
+ // Validate Activation Layer
+ if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+ }
+ return Status{};
+}
+} // namespace
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info));
+
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _has_bias = biases != nullptr;
+ _is_nchw = src->data_layout() == DataLayout::NCHW;
+ _permute = _is_nchw;
+ _is_prepared = false;
+
+ // Configure pipeline
+ _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
+
+ _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
+ if(_is_nchw)
+ {
+ _permute_input = std::make_unique<cpu::CpuPermute>();
+ _permute_weights = std::make_unique<cpu::CpuPermute>();
+ _permute_output = std::make_unique<cpu::CpuPermute>();
+
+ auto input_perm = std::make_unique<TensorInfo>();
+ auto weights_perm = std::make_unique<TensorInfo>();
+ auto output_perm = std::make_unique<TensorInfo>();
+
+ // Configure the function to transform the input tensor from NCHW -> NHWC
+ _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ input_perm->set_data_layout(DataLayout::NHWC);
+
+ // Configure the function to transform the weights tensor from IHW -> HWI
+ _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+ weights_perm->set_data_layout(DataLayout::NHWC);
+
+ output_perm->set_data_layout(DataLayout::NHWC);
+ output_perm->set_quantization_info(dst->quantization_info());
+
+ // Configure optimized depthwise
+ _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
+
+        // Configure the function to transform the convolved output to ACL's native ordering format NCHW
+ output_perm->set_data_layout(DataLayout::NHWC);
+ _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+ }
+ else
+ {
+ _dwc_optimized_func->configure(src, weights, biases, dst, info);
+ }
+
+ // Configure activation
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<cpu::CpuActivation>();
+ _activationlayer_function->configure(dst, nullptr, info.act_info);
+ }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ return validate_arguments_optimized(src, weights, biases, dst, info);
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ prepare(tensors);
+
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+ auto workspace = tensors.get_tensor(TensorType::ACL_INT_3);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+ // Permute input
+ if(_permute)
+ {
+ ITensorPack pack;
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ pack.add_tensor(TensorType::ACL_SRC, src);
+ pack.add_tensor(TensorType::ACL_DST, src_perm);
+ _permute_input->run(pack);
+ }
+
+ // Run assembly function
+ if(_is_nchw)
+ {
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, src_perm);
+ pack.add_tensor(TensorType::ACL_SRC_1, weights_perm);
+ pack.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack.add_tensor(TensorType::ACL_INT_0, workspace);
+ pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+ pack.add_tensor(TensorType::ACL_DST, dst_perm);
+ _dwc_optimized_func->run(pack);
+ }
+ else
+ {
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_tensor(TensorType::ACL_SRC_1);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, src);
+ pack.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack.add_tensor(TensorType::ACL_INT_0, workspace);
+ pack.add_tensor(TensorType::ACL_INT_1, packed_weights);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _dwc_optimized_func->run(pack);
+ }
+
+ // Permute output
+ if(_is_nchw)
+ {
+ ITensorPack pack;
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+ pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output->run(pack);
+ }
+
+ // Run activation
+ if(_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
+
+ // Permute weights
+ if(_permute)
+ {
+ auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, weights);
+ pack.add_tensor(TensorType::ACL_DST, permuted_weights);
+ _permute_weights->run(pack);
+
+ weights->mark_as_unused();
+
+ ITensorPack pack_opt;
+ pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights);
+ pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+ // Prepare optimized function
+ _dwc_optimized_func->prepare(pack_opt);
+ }
+ else
+ {
+ ITensorPack pack_opt;
+ pack_opt.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack_opt.add_tensor(TensorType::ACL_SRC_2, bias);
+ pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights);
+
+ // Prepare optimized function
+ _dwc_optimized_func->prepare(pack_opt);
+ }
+
+ _is_prepared = true;
+ }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, biases, dst, info));
+
+ _is_nchw = src->data_layout() == DataLayout::NCHW;
+ _is_prepared = !_is_nchw;
+
+ ITensorInfo *input_to_use = src;
+ const ITensorInfo *weights_to_use = weights;
+ ITensorInfo *output_to_use = dst;
+
+ auto input_perm = std::make_unique<TensorInfo>();
+ auto weights_perm = std::make_unique<TensorInfo>();
+ auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+
+ if(_is_nchw)
+ {
+ _permute_input = std::make_unique<cpu::CpuPermute>();
+ _permute_weights = std::make_unique<cpu::CpuPermute>();
+
+ _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
+ input_perm->set_data_layout(DataLayout::NHWC);
+ input_to_use = input_perm.get();
+
+ _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
+ weights_perm->set_data_layout(DataLayout::NHWC);
+ weights_to_use = weights_perm.get();
+
+ output_to_use = output_perm.get();
+ }
+
+ _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
+ _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
+
+ if(_is_nchw)
+ {
+ _permute_output = std::make_unique<cpu::CpuPermute>();
+ _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
+ output_perm->set_data_layout(DataLayout::NHWC);
+ }
+
+    // Configure activation layer
+ _is_activationlayer_enabled = info.act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<cpu::CpuActivation>();
+ _activationlayer_function->configure(dst, nullptr, info.act_info);
+ }
+}
+
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ if(src->data_layout() == DataLayout::NCHW)
+ {
+ TensorShape permuted_input_shape = src->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
+ permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
+
+ const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+ }
+
+ // Validate Activation Layer
+ if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
+ }
+
+ return Status{};
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
+{
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
+
+ if(_is_nchw)
+ {
+ prepare(tensors);
+ auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, src);
+ pack.add_tensor(TensorType::ACL_DST, src_perm);
+ _permute_input->run(pack);
+
+ ITensorPack pack_depth;
+ pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm);
+ pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
+ pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+ pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ }
+ else
+ {
+ ITensorPack pack_depth;
+ pack_depth.add_tensor(TensorType::ACL_SRC_0, src);
+ pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
+ pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
+ pack_depth.add_tensor(TensorType::ACL_DST, dst);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ }
+
+ if(_is_nchw)
+ {
+ ITensorPack pack;
+ auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
+ pack.add_tensor(TensorType::ACL_SRC, dst_perm);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output->run(pack);
+ }
+
+ if(_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ ARM_COMPUTE_ERROR_ON(!weights->is_used());
+
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, weights);
+ pack.add_tensor(TensorType::ACL_DST, weights_perm);
+
+ _permute_weights->run(pack);
+ weights->mark_as_unused();
+ _is_prepared = true;
+ }
+}
+
+void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+{
+    _depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.configure(src, weights, biases, dst, info);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.configure(src, weights, biases, dst, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+ DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
+ switch(depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
+ }
+}
+
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+ {
+ return DepthwiseConvolutionFunction::OPTIMIZED;
+ }
+ else
+ {
+ return DepthwiseConvolutionFunction::GENERIC;
+ }
+}
+
+void CpuDepthwiseConv2d::run(ITensorPack &tensors)
+{
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.run(tensors);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.run(tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
+
+void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
+{
+ switch(_depth_conv_func)
+ {
+ case DepthwiseConvolutionFunction::OPTIMIZED:
+ _func_optimized.prepare(tensors);
+ break;
+ case DepthwiseConvolutionFunction::GENERIC:
+ _func_generic.prepare(tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured");
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
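
Editor's note: for reference when wiring this operator into a runtime layer, the sketch below shows how a caller might assemble the ITensorPack consumed by CpuDepthwiseConv2d::run() and prepare(). The slot ids mirror the ones read in the code above; which auxiliary tensors a runtime actually allocates is an assumption of this sketch, since the ACL_INT_* slots are only dereferenced on the NCHW and optimized paths.

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"

    using namespace arm_compute;

    // Illustrative helper: maps caller-owned tensors onto the slots read by
    // CpuDepthwiseConv2d::run()/prepare(). Slots the selected path does not use
    // are never dereferenced.
    ITensorPack make_depthwise_pack(ITensor *src, ITensor *weights, ITensor *biases, ITensor *dst,
                                    ITensor *src_nhwc, ITensor *weights_nhwc, ITensor *dst_nhwc,
                                    ITensor *asm_workspace, ITensor *packed_weights)
    {
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, src);            // input
        pack.add_tensor(TensorType::ACL_SRC_1, weights);        // weights
        pack.add_tensor(TensorType::ACL_SRC_2, biases);         // biases (optional)
        pack.add_tensor(TensorType::ACL_DST_0, dst);            // output
        pack.add_tensor(TensorType::ACL_INT_0, src_nhwc);       // permuted input, NCHW paths only
        pack.add_tensor(TensorType::ACL_INT_1, weights_nhwc);   // permuted weights, NCHW paths only
        pack.add_tensor(TensorType::ACL_INT_2, dst_nhwc);       // permuted output, NCHW paths only
        pack.add_tensor(TensorType::ACL_INT_3, asm_workspace);  // assembly working buffer, optimized path
        pack.add_tensor(TensorType::ACL_INT_4, packed_weights); // packed-weights storage, optimized path
        return pack;
    }
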
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h
new file mode 100644
index 0000000000..15e52ef515
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2d.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+#include "src/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConv2d : public ICpuOperator
+{
+public:
+ /** Default constructor */
+ CpuDepthwiseConv2d() = default;
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+     * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+     * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     * Data type supported: Same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+     * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+     * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] dst Destination tensor. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a Depthwise Convolution Function
+ */
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overriden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+private:
+ /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+ *
+     * @note At the moment only 3x3 and 5x5 kernels with stride 1 or 2 are supported
+     *
+     * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) when no assembly kernel implementation is present
+     * -# @ref CpuDepthwiseConv2d3x3Kernel when the kernel is 3x3 and no assembly kernel implementation is present
+     * -# @ref CpuDepthwiseConv2dAssemblyDispatch if an assembly kernel implementation is present
+ * -# @ref CpuActivation if fused activation is required
+ *
+ */
+ class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConv2dOptimizedInternal() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConv2dOptimizedInternal() = default;
+        /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+ // Inherited methods overriden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_input{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_output{ nullptr };
+ std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
+ bool _has_bias{ false };
+ bool _is_quantized{ false };
+ bool _is_nchw{ true };
+ bool _permute{ false };
+ bool _is_activationlayer_enabled{ false };
+ bool _is_prepared{ false };
+ };
+
+ /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
+ *
+ * -# @ref CpuDepthwiseConv2dNativeKernel
+ *
+ */
+ class CpuDepthwiseConv2dGeneric : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConv2dGeneric() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConv2dGeneric() = default;
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+         * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+         * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+         * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+         * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+         * Data type supported: Same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+         * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+         * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dGeneric::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_input{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
+ std::unique_ptr<CpuPermute> _permute_output{ nullptr };
+ std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
+ bool _is_nchw{ true };
+ bool _is_prepared{ false };
+ bool _is_activationlayer_enabled{ false };
+ };
+
+ DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC };
+ CpuDepthwiseConv2dOptimizedInternal _func_optimized{};
+ CpuDepthwiseConv2dGeneric _func_generic{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H */
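
Editor's note: a configure-time sketch for the interface declared above, using illustrative NHWC F32 shapes. The shapes, padding and depth multiplier are assumptions made for the example; the operator itself chooses the assembly-backed path when its validation passes and falls back to the native kernel otherwise, as implemented in get_depthwiseconvolution_function().

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuDepthwiseConv2d.h"

    using namespace arm_compute;

    void configure_depthwise_example()
    {
        // NHWC tensors are stored as (C, W, H, N): a 32x32 image with 16 channels and a 3x3 kernel.
        TensorInfo src_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
        TensorInfo weights_info(TensorShape(16U, 3U, 3U), 1, DataType::F32);
        TensorInfo bias_info(TensorShape(16U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        weights_info.set_data_layout(DataLayout::NHWC);
        dst_info.set_data_layout(DataLayout::NHWC);

        ConvolutionInfo info{};
        info.pad_stride_info  = PadStrideInfo(1, 1, 1, 1); // stride 1, pad 1 -> same spatial size
        info.depth_multiplier = 1;

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDepthwiseConv2d::validate(&src_info, &weights_info, &bias_info, &dst_info, info));

        cpu::CpuDepthwiseConv2d dwc;
        dwc.configure(&src_info, &weights_info, &bias_info, &dst_info, info);
    }
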
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
new file mode 100644
index 0000000000..a353a66dc2
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/utils/AssemblyUtils.h"
+#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
+{
+ std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
+ bool is_prepared{ false };
+ experimental::MemoryRequirements mem_req{};
+};
+
+#ifndef DOXYGEN_SKIP_THIS
+CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
+ : _pImpl(std::make_unique<LocalImpl>())
+{
+}
+#endif /* DOXYGEN_SKIP_THIS */
+
+CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default;
+
+void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
+{
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+ _pImpl->is_prepared = false;
+
+    // If we don't support a combination of data types, silently return: callers are expected to check validate() before calling configure()
+ if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+ {
+ return;
+ }
+
+ auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>();
+ ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr);
+ dwc_wrapper->configure(src, weights, bias, dst, info, ci);
+
+ // Compute memory requirements for assembly kernels
+ constexpr size_t alignment = 4096;
+ _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads, src->dimension(0)), alignment });
+ _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
+ _pImpl->asm_kernel = std::move(dwc_wrapper);
+}
+
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+{
+ return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
+}
+
+experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const
+{
+ return _pImpl->mem_req;
+}
+
+bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
+{
+ arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
+ return act.type != arm_gemm::Activation::Type::None;
+}
+
+void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ prepare(tensors);
+
+ NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors);
+}
+
+void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
+{
+ if(!_pImpl->is_prepared)
+ {
+ // Pack weights and bias
+ const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
+
+ const auto weights_ptr = weights->buffer() + weights->info()->offset_first_element_in_bytes();
+ const auto bias_ptr = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr;
+ auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
+
+ const auto weights_shape = weights->info()->tensor_shape();
+ const auto weights_padding = weights->info()->padding();
+
+ const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
+ const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
+ _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
+
+ weights->mark_as_unused();
+ if(bias != nullptr)
+ {
+ bias->mark_as_unused();
+ }
+ _pImpl->is_prepared = true;
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
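
Editor's note: the two memory requirements registered in configure() above (working buffer and packed-weights storage) must be backed by the caller before prepare()/run(). The sketch below shows one way a runtime could do that; it assumes the experimental::MemoryRequirements entries expose slot, size and alignment as used in the code above, and that the pack already carries the ACL_SRC_0/1/2 and ACL_DST tensors.

    #include <memory>
    #include <vector>

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"

    using namespace arm_compute;

    void back_workspace_and_run(cpu::CpuDepthwiseConv2dAssemblyDispatch &dwc, ITensorPack &pack)
    {
        // One plain U8 buffer per requirement, allocated with the requested alignment.
        std::vector<std::unique_ptr<Tensor>> aux;
        for(const auto &req : dwc.workspace())
        {
            auto t = std::make_unique<Tensor>();
            t->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8), req.alignment);
            t->allocator()->allocate();
            pack.add_tensor(req.slot, t.get()); // e.g. ACL_INT_0 (working buffer), ACL_INT_1 (packed weights)
            aux.push_back(std::move(t));
        }

        dwc.prepare(pack); // packs weights/bias into the ACL_INT_1 storage
        dwc.run(pack);
    }
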
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
new file mode 100644
index 0000000000..af202ced5b
--- /dev/null
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Depthwise convolution assembly kernel glue */
+class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator
+{
+public:
+ CpuDepthwiseConv2dAssemblyDispatch();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch);
+ ~CpuDepthwiseConv2dAssemblyDispatch();
+    /** Initialize the function's source, destination and convolution information.
+ *
+ * @note Supports only NHWC format
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM].
+ * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: same as @p src or S32 if @p src is quantized.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Checks if activation is supported by the assembly kernels
+ *
+ * @param[in] activation Activation to check
+ *
+ * @return True if activation is supported else false
+ */
+ static bool is_activation_supported(const ActivationLayerInfo &activation);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ struct LocalImpl;
+ std::unique_ptr<LocalImpl> _pImpl;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */
diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp
new file mode 100644
index 0000000000..7c03571f40
--- /dev/null
+++ b/src/cpu/operators/CpuDequantize.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuDequantize.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuDequantizeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuDequantizeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuDequantizeKernel::validate(src, dst);
+}
+
+void CpuDequantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ prepare(tensors);
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDequantize.h b/src/cpu/operators/CpuDequantize.h
new file mode 100644
index 0000000000..dbfc0c612a
--- /dev/null
+++ b/src/cpu/operators/CpuDequantize.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H
+#define ARM_COMPUTE_CPU_DEQUANTIZE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */
+class CpuDequantize : public ICpuOperator
+{
+public:
+ /** Configure the kernel.
+ *
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst Destination tensor info with the same dimensions as @p src. Data type supported: F16/F32.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuDequantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */
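
Editor's note: a minimal usage sketch for the operator declared above, dequantizing an assumed QASYMM8 tensor (scale 0.1, offset 128 are illustrative values) into F32. The ACL_SRC/ACL_DST slot ids are assumed from the kernel conventions used throughout this patch.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuDequantize.h"

    using namespace arm_compute;

    void run_dequantize_example()
    {
        TensorInfo src_info(TensorShape(64U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128));
        TensorInfo dst_info(TensorShape(64U), 1, DataType::F32);

        cpu::CpuDequantize dequantize;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDequantize::validate(&src_info, &dst_info));
        dequantize.configure(&src_info, &dst_info);

        Tensor src;
        Tensor dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        dequantize.run(pack); // dst[i] = (src[i] - 128) * 0.1f
    }
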
diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp
new file mode 100644
index 0000000000..ec52dbf153
--- /dev/null
+++ b/src/cpu/operators/CpuDirectConv2d.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuDirectConv2d.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuDirectConv2d::~CpuDirectConv2d() = default;
+
+CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
+ _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+{
+}
+
+void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>();
+ _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>();
+ _input_border_handler = std::make_unique<NEFillBorderKernel>();
+
+ // Free accumulator
+ if(_accumulator.buffer() != nullptr)
+ {
+ _accumulator.allocator()->free();
+ }
+
+ _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY;
+
+ // Check if bias should be added in the convolution result
+ _has_bias = (bias != nullptr);
+
+ _conv_kernel->configure(src, weights, dst, conv_info);
+ if(_has_bias)
+ {
+ _output_stage_kernel->configure(dst, bias);
+ }
+ _is_padding_required = !_conv_kernel->border_size().empty();
+
+ if(_is_padding_required)
+ {
+ // Add zero padding XY
+ _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ }
+
+    // Configure activation layer
+ _is_activationlayer_enabled = act_info.enabled();
+ if(_is_activationlayer_enabled)
+ {
+ _activationlayer_function = std::make_unique<CpuActivation>();
+ _activationlayer_function->configure(dst, dst, act_info);
+ }
+}
+
+Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+
+ // output might not be initialized since it can be an intermediate tensor of another layer
+ DataType data_type = src->data_type();
+ TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+ // Validate Convolution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of output feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // Validate bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
+
+ if(act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
+ }
+
+ return Status{};
+}
+
+void CpuDirectConv2d::run(ITensorPack &tensors)
+{
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
+ auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ if(_is_padding_required)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_DST, src);
+ NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+ }
+ NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
+ if(_has_bias)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, dst);
+ pack.add_tensor(TensorType::ACL_SRC_1, bias);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
+ }
+
+ if(_is_activationlayer_enabled)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, dst);
+ pack.add_tensor(TensorType::ACL_DST, dst);
+ _activationlayer_function->run(pack);
+ }
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h
new file mode 100644
index 0000000000..fa8d61e083
--- /dev/null
+++ b/src/cpu/operators/CpuDirectConv2d.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuDirectConv2dKernel.h"
+#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to run the direct convolution.
+ *
+ * This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel for the input
+ * -# @ref kernels::CpuDirectConv2dOutputStageKernel
+ * -# @ref kernels::CpuDirectConv2dKernel
+ */
+class CpuDirectConv2d : public ICpuOperator
+{
+public:
+ CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ ~CpuDirectConv2d();
+ /** Set the input, weights, biases and output tensors.
+ *
+     * @note DirectConvolution only works in the following configurations:
+     *    1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F16/F32
+     *    3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F16/F32
+     *    5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3, data type = F32
+ *
+ * @param[in, out] src Input tensor info. Data types supported: F16/F32.
+ * @param[in] weights Set of kernels to convolve the input volume.
+ * Supported sizes: 1x1, 3x3 and 5x5.
+ * The 3rd dimension must be the same as the input's volume 3rd dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src.
+ * @param[out] dst Output tensor info.
+     * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ MemoryGroup _memory_group;
+ std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel;
+ std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel;
+ std::unique_ptr<NEFillBorderKernel> _input_border_handler;
+ std::unique_ptr<CpuActivation> _activationlayer_function;
+ Tensor _accumulator;
+ bool _has_bias{ false };
+ bool _is_activationlayer_enabled{ false };
+ unsigned int _dim_split{ 0 };
+ bool _is_padding_required{ false };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */
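
Editor's note: a usage sketch for the function above with an assumed NCHW F32 3x3 configuration. Shapes, stride and padding are illustrative; the ACL_SRC_0/1/2 and ACL_DST slots follow the run() implementation earlier in this patch. Backing tensors are initialised only after configure() so that any padding the kernel adds to the infos is honoured.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuDirectConv2d.h"

    using namespace arm_compute;

    void run_direct_conv_example()
    {
        // NCHW: 32x32 input with 8 channels, 16 filters of size 3x3, stride 1, pad 1.
        TensorInfo src_info(TensorShape(32U, 32U, 8U), 1, DataType::F32);
        TensorInfo weights_info(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32);
        TensorInfo bias_info(TensorShape(16U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(32U, 32U, 16U), 1, DataType::F32);
        const PadStrideInfo conv_info(1, 1, 1, 1);

        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv2d::validate(&src_info, &weights_info, &bias_info, &dst_info, conv_info));

        cpu::CpuDirectConv2d conv;
        conv.configure(&src_info, &weights_info, &bias_info, &dst_info, conv_info);

        Tensor src, weights, bias, dst;
        src.allocator()->init(src_info);
        weights.allocator()->init(weights_info);
        bias.allocator()->init(bias_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, &src);     // input (may be written to for border filling)
        pack.add_tensor(TensorType::ACL_SRC_1, &weights); // weights
        pack.add_tensor(TensorType::ACL_SRC_2, &bias);    // bias
        pack.add_tensor(TensorType::ACL_DST, &dst);       // output
        conv.run(pack);
    }
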
diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp
new file mode 100644
index 0000000000..4f767434f3
--- /dev/null
+++ b/src/cpu/operators/CpuElementwise.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuElementwise.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/CpuElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuElementwiseBase::run(ITensorPack &tensors)
+{
+ // If the kernel has been configured, use the window from the kernel.
+ if(_kernel->is_window_configured())
+ {
+ ICpuOperator::run(tensors);
+ return;
+ }
+
+ auto src0_info = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info();
+ auto src1_info = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info();
+ auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape());
+ ICpuOperator::run(tensors, shape_and_window.second);
+}
+
+template <ArithmeticOperation op>
+void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuArithmeticKernel>();
+ k->configure(op, src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+template <ArithmeticOperation op>
+Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst);
+}
+
+template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
+template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
+template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
+template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
+
+void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuDivisionKernel>();
+ k->configure(src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuDivisionKernel::validate(src0, src1, dst);
+}
+
+void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuPowerKernel>();
+ k->configure(src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuPowerKernel::validate(src0, src1, dst);
+}
+
+template <ComparisonOperation COP>
+void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuComparisonKernel>();
+ k->configure(COP, src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+template <ComparisonOperation COP>
+Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+{
+ return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
+}
+
+void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
+{
+ auto k = std::make_unique<kernels::CpuComparisonKernel>();
+ k->configure(op, src0, src1, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
+{
+ return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
+}
+
+// Supported Specializations
+template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h
new file mode 100644
index 0000000000..b6c61cf245
--- /dev/null
+++ b/src/cpu/operators/CpuElementwise.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H
+#define ARM_COMPUTE_CPU_ELEMENTWISE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuElementwiseBase : public ICpuOperator
+{
+public:
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for division and power
+ *
+ * @note Max/Min/Squared difference support input data types of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32
+ * @note PRelu supports input data types of QASYMM8/QASYMM8_SIGNED/F16/F32.
+ */
+template <ArithmeticOperation op>
+class CpuElementwiseArithmetic : public CpuElementwiseBase
+{
+public:
+ /** Configure the operator
+ *
+ * @param[in] src0 The first source tensor information.
+ * @param[in] src1 The second source tensor information. With PRelu, this is used as alpha tensor.
+ * @param[out] dst The output tensor information.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseArithmetic::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the maximum operation */
+using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the minimum operation */
+using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the squared difference operation */
+using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
+
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
+ *
+ * @note The tensor data type for the inputs must be S32/F16/F32.
+ * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i])
+ */
+class CpuElementwiseDivision : public CpuElementwiseBase
+{
+public:
+    /** Initialise the operator's inputs and dst.
+     *
+     * @param[in]  src0 First tensor input info. Data types supported: S32/F16/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseDivision::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ * @note For an exponent that is a float, this function will only work with a positive base.
+ */
+class CpuElementwisePower : public CpuElementwiseBase
+{
+public:
+    /** Initialise the operator's inputs and dst.
+     *
+     * @param[in]  src0 First tensor input info. Data types supported: F16/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: Same as @p src0.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwisePower::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a comparison operation between two tensors.
+ */
+class CpuElementwiseComparison : public CpuElementwiseBase
+{
+public:
+    /** Initialise the operator's inputs and dst.
+     *
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: U16/U32.
+     * @param[in]  op   Comparison Operation to be performed.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseComparison::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
+};
+
+/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a comparison operation between two tensors.
+ */
+template <ComparisonOperation op>
+class CpuElementwiseComparisonStatic : public CpuElementwiseBase
+{
+public:
+    /** Initialise the operator's inputs and dst.
+     *
+     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  src1 Second tensor input info. Data types supported: Same as @p src0.
+     * @param[out] dst  Output tensor info. Data types supported: U16/U32.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseComparisonStatic::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+};
+
+/** Basic function to run equal comparison. */
+using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
+/** Basic function to run not equal comparison. */
+using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+/** Basic function to run greater comparison. */
+using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
+/** Basic function to run greater-equal comparison. */
+using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+/** Basic function to run less comparison. */
+using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
+/** Basic function to run less-equal comparison. */
+using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ \ No newline at end of file
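
A minimal sketch of how these stateless elementwise operators are driven: configure()/validate() work on ITensorInfo, while run() receives the backing tensors through an ITensorPack keyed by ACL_SRC_0/ACL_SRC_1/ACL_DST, matching CpuElementwiseBase::run() above. The shapes and the use of the runtime Tensor class for allocation are assumptions of the sketch, not part of this patch.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuElementwise.h"

    using namespace arm_compute;

    void elementwise_max_sketch()
    {
        // Hypothetical 16x16 F32 inputs with matching shapes (no broadcasting).
        TensorInfo a_info(TensorShape(16U, 16U), 1, DataType::F32);
        TensorInfo b_info(TensorShape(16U, 16U), 1, DataType::F32);
        TensorInfo out_info(TensorShape(16U, 16U), 1, DataType::F32);

        cpu::CpuElementwiseMax max_op;
        max_op.configure(&a_info, &b_info, &out_info);

        // Backing tensors are created and allocated by the caller.
        Tensor a, b, out;
        a.allocator()->init(a_info);
        b.allocator()->init(b_info);
        out.allocator()->init(out_info);
        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        ITensorPack pack{ { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_DST, &out } };
        max_op.run(pack);
    }
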
diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp
new file mode 100644
index 0000000000..7cf1488c44
--- /dev/null
+++ b/src/cpu/operators/CpuElementwiseUnary.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuElementwiseUnary.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+using KernelType = kernels::CpuElementwiseUnaryKernel;
+
+void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
+{
+ auto k = std::make_unique<KernelType>();
+ k->configure(op, src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
+{
+ return KernelType::validate(op, src, dst);
+}
+
+void CpuElementwiseUnary::run(ITensorPack &tensors)
+{
+ if(_kernel->is_window_configured())
+ {
+ ICpuOperator::run(tensors);
+ return;
+ }
+
+ auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info();
+ ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
+}
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h
new file mode 100644
index 0000000000..5e8e98d047
--- /dev/null
+++ b/src/cpu/operators/CpuElementwiseUnary.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
+#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
+
+#include "arm_compute/core/Types.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuElementwiseUnary : public ICpuOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] op Unary operation to execute
+     * @param[in]  src Input tensor information. Data types supported: F16/F32 (F16/F32/S32 for the NEG/ABS operations).
+ * @param[out] dst Output tensor information. Data types supported: Same as @p src.
+ */
+ void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuElementwiseUnary::configure()
+ *
+ * @return a status
+ */
+ static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ \ No newline at end of file
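
The unary operator follows the same pattern, but takes the tensor infos by reference and a single ACL_SRC input at run time (see CpuElementwiseUnary::run() above). A hedged sketch with hypothetical shapes:

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuElementwiseUnary.h"

    using namespace arm_compute;

    void elementwise_unary_sketch()
    {
        TensorInfo src_info(TensorShape(8U, 8U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(8U, 8U), 1, DataType::F32);

        cpu::CpuElementwiseUnary neg_op;
        if(bool(cpu::CpuElementwiseUnary::validate(ElementWiseUnary::NEG, src_info, dst_info)))
        {
            neg_op.configure(ElementWiseUnary::NEG, src_info, dst_info);
        }

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack{ { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
        neg_op.run(pack);
    }
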
diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp
new file mode 100644
index 0000000000..c0b48f5830
--- /dev/null
+++ b/src/cpu/operators/CpuFill.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFill.h"
+
+#include "src/cpu/kernels/CpuFillKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value)
+{
+ auto k = std::make_unique<kernels::CpuFillKernel>();
+ k->configure(tensor, constant_value);
+ _kernel = std::move(k);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h
new file mode 100644
index 0000000000..1cb99f5662
--- /dev/null
+++ b/src/cpu/operators/CpuFill.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FILL_H
+#define ARM_COMPUTE_CPU_FILL_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuFillKernel */
+class CpuFill : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in,out] tensor Tensor to fill. Supported data types: All
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ */
+ void configure(const ITensorInfo *tensor, PixelValue constant_value);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FILL_H */
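
A small, hypothetical sketch of configuring CpuFill: the constant is passed as a PixelValue, and the tensor to fill is only provided at run time through an ITensorPack.

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/operators/CpuFill.h"

    using namespace arm_compute;

    void fill_sketch()
    {
        // Hypothetical 4x4 F32 tensor description.
        TensorInfo info(TensorShape(4U, 4U), 1, DataType::F32);

        cpu::CpuFill fill;
        fill.configure(&info, PixelValue(0.f));
        // At run time the tensor to be filled is supplied through an ITensorPack,
        // as with the other stateless operators in this directory.
    }
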
diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp
new file mode 100644
index 0000000000..685e5b9238
--- /dev/null
+++ b/src/cpu/operators/CpuFlatten.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFlatten.h"
+
+#include "src/cpu/kernels/CpuReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuReshapeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuReshapeKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.h b/src/cpu/operators/CpuFlatten.h
new file mode 100644
index 0000000000..0e9fcbdc35
--- /dev/null
+++ b/src/cpu/operators/CpuFlatten.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FLATTEN_H
+#define ARM_COMPUTE_CPU_FLATTEN_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to flatten a given input */
+class CpuFlatten : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] src Source tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: All
+     * @param[out] dst Destination tensor with shape [w*h*d, input_batches] where:
+ * w = width input tensor, h = height input tensor and d = depth input tensor.
+ * Data type supported: same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuFlatten::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FLATTEN_H */
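
To illustrate the shape contract documented above ([w*h*d, input_batches]), here is a hedged sketch with hypothetical dimensions:

    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/operators/CpuFlatten.h"

    using namespace arm_compute;

    void flatten_sketch()
    {
        // A 16x16x8 volume with 2 batches flattens to a 2048x2 matrix (w*h*d, batches).
        TensorInfo src(TensorShape(16U, 16U, 8U, 2U), 1, DataType::F32);
        TensorInfo dst(TensorShape(16U * 16U * 8U, 2U), 1, DataType::F32);

        cpu::CpuFlatten flatten;
        if(bool(cpu::CpuFlatten::validate(&src, &dst)))
        {
            flatten.configure(&src, &dst);
        }
    }
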
diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp
new file mode 100644
index 0000000000..55f645847f
--- /dev/null
+++ b/src/cpu/operators/CpuFloor.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFloor.h"
+
+#include "src/cpu/kernels/CpuFloorKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuFloorKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuFloorKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuFloor.h b/src/cpu/operators/CpuFloor.h
new file mode 100644
index 0000000000..6082f98867
--- /dev/null
+++ b/src/cpu/operators/CpuFloor.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FLOOR_H
+#define ARM_COMPUTE_CPU_FLOOR_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuFloorKernel */
+class CpuFloor : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuFloor::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FLOOR_H */
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
new file mode 100644
index 0000000000..cafb3484b6
--- /dev/null
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuFullyConnected.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
+#include "src/cpu/operators/CpuFlatten.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Get min, max bound of a quantized asymmetric dst tensor, with the effect of fused activation
+std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
+{
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ const UniformQuantizationInfo q_unif = q_info.uniform();
+
+ if(act_info.enabled())
+ {
+ switch(act_info.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ type_min = PixelValue(q_unif.offset);
+ break;
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ type_min = PixelValue(q_unif.offset);
+ type_max = PixelValue(act_info.a(), data_type, q_info);
+ break;
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ type_min = PixelValue(act_info.b(), data_type, q_info);
+ type_max = PixelValue(act_info.a(), data_type, q_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Activation function not supported.");
+ break;
+ }
+ }
+
+ return std::make_pair(type_min, type_max);
+}
+
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+{
+ const auto data_type = src->data_type();
+ const QuantizationInfo oq_info = dst->quantization_info();
+ const UniformQuantizationInfo iq_unif = src->quantization_info().uniform();
+ const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform();
+ const UniformQuantizationInfo oq_unif = oq_info.uniform();
+
+ float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
+ int32_t output_multiplier;
+ int32_t output_shift;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
+
+ gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage_info.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset;
+ gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get<int32_t>();
+ gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get<int32_t>();
+
+ return Status{};
+}
+
+Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act)
+{
+ if(is_data_type_quantized_asymmetric(src->data_type()))
+ {
+        // Since we need negative offsets for the matrix multiplication, we need to change QuantizationInfo()
+ // Extract and negate src and weights offset
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
+ ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));
+
+ GEMMInfo gemm_info;
+ gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
+
+ // Validate gemmlowp function
+ TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
+ TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info,
+ &weights_info,
+ biases,
+ dst,
+ gemm_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
+ }
+
+ return Status{};
+}
+} // namespace
+
+CpuFullyConnected::CpuFullyConnected()
+ : _flatten(nullptr),
+ _convert_weights(nullptr),
+ _transpose_weights(nullptr),
+ _mm_gemm(nullptr),
+ _mm_gemmlowp(nullptr),
+ _flattened_src(),
+ _converted_weights(),
+ _reshaped_weights(),
+ _trans_weights(),
+ _trans_weights_idx(AuxTensorIdx::Count),
+ _aux_mem(Count),
+ _needs_weights_conversion(false),
+ _needs_weights_reshape(false),
+ _is_fc_after_conv(false),
+ _is_quantized_asymmetric(false),
+ _is_prepared(false)
+
+{
+}
+
+CpuFullyConnected::~CpuFullyConnected() = default;
+
+void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+{
+ if(_is_quantized_asymmetric)
+ {
+        // Since we need negative offsets for the matrix multiplication, we need to change QuantizationInfo()
+ // Extract and negate src and weights offset
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+
+ TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
+ TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
+
+ // Configure gemmlowp function and output stage for asymmetric quantized types
+ GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
+ const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
+ ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
+
+ GEMMInfo gemm_info;
+ gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
+ gemm_info.set_activation_info(act);
+ _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(&src_info, &weights_info, biases, dst, gemm_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */);
+ gemm_info.set_activation_info(act);
+ _mm_gemm = std::make_unique<CpuGemm>();
+ _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info);
+ }
+}
+
+void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+{
+ ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the src tensor must be linearized
+
+ // Initialize output tensor for flatten
+ auto_init_if_empty(_flattened_src, src->clone()->set_tensor_shape(compute_flatten_shape(src)));
+
+ _flatten = std::make_unique<CpuFlatten>();
+ _flatten->configure(src, &_flattened_src);
+
+ // Configure matrix multiply kernel
+ configure_mm(&_flattened_src, weights, biases, dst, act);
+}
+
+void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+{
+ ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
+
+ // Configure matrix multiply kernel
+ configure_mm(src, weights, biases, dst, act);
+}
+
+void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src,
+ weights,
+ biases != nullptr ? biases : nullptr,
+ dst,
+ fc_info));
+
+ _needs_weights_conversion = false;
+ _needs_weights_reshape = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+ _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights;
+ _is_fc_after_conv = true;
+ _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
+ _is_prepared = false;
+ _trans_weights_idx = AuxTensorIdx::Count;
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
+ if(is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
+ src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ // Reshape weights if needed
+ if(_needs_weights_reshape)
+ {
+ // Reshape the weights
+ _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
+ _transpose_weights->configure(weights, &_reshaped_weights);
+ weights_to_use = &_reshaped_weights;
+ _trans_weights_idx = AuxTensorIdx::TransposedWeights;
+ }
+
+ // Convert weights if needed
+ if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
+ _convert_weights->configure(weights_to_use,
+ &_converted_weights,
+ src->tensor_shape(),
+ fc_info.weights_trained_layout);
+
+ weights_to_use = &_converted_weights;
+ _needs_weights_conversion = true;
+ _trans_weights_idx = AuxTensorIdx::ConvertedWeights;
+ }
+
+ if(_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
+ }
+
+ // Retain the tensorinfo with the weights to use
+ if(_needs_weights_reshape || _needs_weights_conversion)
+ {
+ _trans_weights = *weights_to_use;
+ }
+
+ // Set auxiliary memory requirements
+ auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
+ for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ {
+ _aux_mem[i] = gemm_mem_req[i];
+ }
+
+ if(_aux_mem[Pretranspose].size > 0)
+ {
+        // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size());
+ }
+ else
+ {
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), _needs_weights_conversion ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Persistent, _converted_weights.total_size());
+ }
+ _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+}
+
+Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
+ && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!fc_info.constant_weights, "Non-constant weights are currently not supported");
+
+ bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+
+ const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
+ const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *src_to_use = src;
+ const ITensorInfo *weights_to_use = weights;
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
+
+ if(is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
+ src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ if(!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use,
+ &converted_weights,
+ src->tensor_shape(),
+ fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if(is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
+ src_to_use = &flatten_src;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info));
+
+ return Status{};
+}
+
+void CpuFullyConnected::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+
+ CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+ CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
+
+ // Linearize src if it comes from a convolutional layer
+ if(_is_fc_after_conv)
+ {
+ ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+ _flatten->run(flatten_pack);
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+ if(_needs_weights_reshape || _needs_weights_conversion)
+ {
+ gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
+ }
+
+ // Run matrix multiply
+ if(_is_quantized_asymmetric)
+ {
+ _mm_gemmlowp->run(gemm_pack);
+ }
+ else
+ {
+ _mm_gemm->run(gemm_pack);
+ }
+}
+
+void CpuFullyConnected::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto weights = tensors.get_const_tensor(ACL_SRC_1);
+
+ CpuAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
+ CpuAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
+
+ // Pointer to current weights
+ const ITensor *cur_weights = weights;
+
+ // Reshape of the weights (happens only once)
+ if(_needs_weights_reshape)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
+ NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = reshaped_weights.get();
+ }
+
+ // Convert weights if needed (happens only once)
+ if(_needs_weights_conversion)
+ {
+ ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+ _convert_weights->run(convert_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = converted_weights.get();
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
+
+        // Prepare the GEMM function and release unused weights
+ if(!_is_quantized_asymmetric)
+ {
+ _mm_gemm->prepare(gemm_pack);
+ }
+ else
+ {
+ _mm_gemmlowp->prepare(gemm_pack);
+ }
+
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements CpuFullyConnected::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
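
For readers following get_gemmlowp_output_stage_info() above: the real-valued requantization scale multiplier = (src_scale * weights_scale) / dst_scale is folded into a Q31 fixed-point multiplier plus a right shift for the QUANTIZE_DOWN_FIXEDPOINT stage. The standalone sketch below shows the usual decomposition; it mirrors what quantization::calculate_quantized_multiplier does in spirit but is not the library implementation.

    #include <cmath>
    #include <cstdint>

    // Decompose a positive real scale into a Q31 multiplier and a right shift so that
    // multiplier ~= quantized_multiplier * 2^-31 * 2^-shift (shift > 0 means divide by 2^shift).
    void decompose_multiplier(float multiplier, int32_t &quantized_multiplier, int32_t &shift)
    {
        if(multiplier <= 0.f) // quantization scales are positive; treat anything else as zero
        {
            quantized_multiplier = 0;
            shift                = 0;
            return;
        }
        int   exponent = 0;
        float mantissa = std::frexp(multiplier, &exponent); // multiplier = mantissa * 2^exponent, mantissa in [0.5, 1)
        auto  q_fixed  = static_cast<int64_t>(std::llround(mantissa * (1LL << 31)));
        if(q_fixed == (1LL << 31)) // rounding pushed the mantissa up to 1.0
        {
            q_fixed /= 2;
            ++exponent;
        }
        quantized_multiplier = static_cast<int32_t>(q_fixed);
        shift                = -exponent;
    }
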
diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h
new file mode 100644
index 0000000000..304ea3c62b
--- /dev/null
+++ b/src/cpu/operators/CpuFullyConnected.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H
+#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H
+
+#include "src/cpu/ICpuOperator.h"
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+// Forward declarations
+class CpuConvertFullyConnectedWeights;
+class CpuFlatten;
+class CpuGemm;
+class CpuGemmLowpMatrixMultiplyCore;
+namespace kernels
+{
+class CpuTransposeKernel;
+} // namespace kernels
+/** Basic function to compute a Fully Connected layer. This function calls the following kernels:
+ *  -# @ref CpuFlatten (called when the input comes from a convolutional layer)
+ *  -# @ref kernels::CpuTransposeKernel (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once)
+ * -# @ref CpuGemm or @ref CpuGemmLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref kernels::CpuGemmMatrixAdditionKernel or @ref CpuGemmLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class CpuFullyConnected : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuFullyConnected();
+ /** Destructor */
+ ~CpuFullyConnected();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
+     *                     If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 dimensions of the input.
+ * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+ * Data type supported: Same as @p src.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
+ *
+     * Similar to @ref CpuFullyConnected::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+    // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
+ void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
+ void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
+
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ Pretranspose,
+ GemmTemp1, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
+ GemmTemp2, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
+ GemmTemp3, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
+ GemmTemp4, // CpuGemmLowpMatrixMultiplyCore only
+ GemmTemp5, // CpuGemmLowpMatrixMultiplyCore only
+ GemmTemp6, // CpuGemmLowpMatrixMultiplyCore only
+ GemmTemp7, // CpuGemmLowpMatrixMultiplyCore only
+ TransposedWeights,
+ ConvertedWeights,
+ FlattenedSrc,
+ Count
+ };
+
+ std::unique_ptr<CpuFlatten> _flatten;
+ std::unique_ptr<CpuConvertFullyConnectedWeights> _convert_weights;
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_weights;
+ std::unique_ptr<CpuGemm> _mm_gemm;
+ std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+
+ TensorInfo _flattened_src;
+ TensorInfo _converted_weights;
+ TensorInfo _reshaped_weights;
+ TensorInfo _trans_weights;
+ AuxTensorIdx _trans_weights_idx;
+
+ experimental::MemoryRequirements _aux_mem;
+
+ bool _needs_weights_conversion;
+ bool _needs_weights_reshape;
+ bool _is_fc_after_conv;
+ bool _is_quantized_asymmetric;
+ bool _is_prepared;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_FULLY_CONNECTED_H */
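
A hedged usage sketch of the operator declared above, for the fully-connected-after-fully-connected case without batches (case 2 in the comments). Shapes and names are hypothetical; weights are passed untransposed since transpose_weights defaults to true, and the caller is responsible for providing the auxiliary tensors reported by workspace() when calling run().

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuFullyConnected.h"

    using namespace arm_compute;

    void fully_connected_sketch()
    {
        // Hypothetical sizes: 128 inputs mapped to 64 outputs, F32, single batch.
        TensorInfo src(TensorShape(128U), 1, DataType::F32);
        TensorInfo weights(TensorShape(128U, 64U), 1, DataType::F32); // 2D weights, transposed by the operator
        TensorInfo biases(TensorShape(64U), 1, DataType::F32);
        TensorInfo dst(TensorShape(64U), 1, DataType::F32);

        cpu::CpuFullyConnected fc;
        if(bool(cpu::CpuFullyConnected::validate(&src, &weights, &biases, &dst)))
        {
            fc.configure(&src, &weights, &biases, &dst);
        }

        // The operator owns no memory: the caller inspects workspace() and packs the listed
        // auxiliary tensors, together with ACL_SRC_0/ACL_SRC_1/ACL_SRC_2 and ACL_DST,
        // into the ITensorPack passed to run() and prepare().
        const experimental::MemoryRequirements aux_mem = fc.workspace();
        (void)aux_mem;
    }
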
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
new file mode 100644
index 0000000000..f7416315e9
--- /dev/null
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemm.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.fast_mode = info.fast_math();
+
+ return asm_info;
+}
+} // namespace
+
+void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
+
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
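+ // When matrix B is reshaped only on the first run, C is treated as a bias: it is added to the
+ // product (via CpuAdd or the assembly bias input) instead of being accumulated as beta * C.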
+ const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
+ bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info));
+
+ // Check if we need to reshape the matrix B only on the first run
+ _is_prepared = false;
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _run_vector_matrix_multiplication = a->dimension(1) < 2;
+ _run_alpha_scale = alpha != 1.f;
+ _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run();
+ _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
+ _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised
+ && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+
+ if(run_optimised)
+ {
+ const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
+ _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+ _asm_glue->configure(a, b, c_to_use, d, asm_info);
+ ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
+
+ auto asm_mem_req = _asm_glue->workspace();
+ _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
+ _aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
+
+ // Scale product by alpha
+ if(_run_alpha_scale)
+ {
+ _alpha_scale_func = std::make_unique<cpu::CpuActivation>();
+ _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+ }
+ }
+ else
+ {
+ // Pick output tensor in case bias addition should be performed
+ ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
+
+ _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
+
+ // Select between GEMV and GEMM
+ if(_run_vector_matrix_multiplication)
+ {
+ // Configure the matrix multiply kernel
+ _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
+ }
+ else
+ {
+ const int m = a->dimension(1);
+ const int n = b->dimension(0);
+ const int k = a->dimension(0);
+
+ // Configure interleave kernel
+ _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
+ _interleave_kernel->configure(a, &_tmp_a);
+ _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
+
+ // Configure transpose kernel
+ _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
+ _transpose_kernel->configure(b, &_tmp_b);
+ _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+
+ // Configure matrix multiplication kernel
+ _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
+ }
+
+ if(_run_bias_addition)
+ {
+ _add_bias = std::make_unique<cpu::CpuAdd>();
+ _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
+ _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
+ }
+ }
+
+ // Configure matrix addition kernel
+ if(_run_addition)
+ {
+ _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
+ _ma_kernel->configure(c, d, beta);
+ }
+
+ // Configure activation
+ if(_run_activation)
+ {
+ _activation_func = std::make_unique<cpu::CpuActivation>();
+ _activation_func->configure(d, nullptr, gemm_info.activation_info());
+ }
+}
+
+Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ if(a->data_type() != DataType::BFLOAT16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
+ }
+
+ if(c != nullptr && !is_c_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ }
+
+ if(d->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != d->dimension(0));
+ if(gemm_info.depth_output_gemm3d() != 0)
+ {
+ if(gemm_info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1) * d->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
+ }
+ }
+
+ // Check if we need to run the optimized assembly kernel
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info));
+
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D");
+
+ // Check if the first input tensor is a vector.
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ // Check if we need to reshape the matrix A and matrix B
+ const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run());
+
+ // Arguments used by GEMMReshapeInfo
+ // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->dimension(1);
+ const int n = b->dimension(0);
+ const int k = a->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo tmp_output_info = *d->clone();
+
+ if(run_interleave_transpose)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
+
+ // Validate transpose kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+ }
+
+ // Validate matrix multiply
+ auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+
+ if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE));
+ }
+ }
+
+ // Validate matrix addition kernel
+ if(beta != 0 && c != nullptr && !is_c_bias)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta));
+ }
+
+ // Validate activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if(activation.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation));
+ }
+
+ return Status{};
+}
+
+void CpuGemm::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto a = tensors.get_const_tensor(ACL_SRC_0);
+ auto b = tensors.get_const_tensor(ACL_SRC_1);
+ auto c = tensors.get_const_tensor(ACL_SRC_2);
+ auto d = tensors.get_tensor(ACL_DST);
+
+ if(_asm_glue->is_configured())
+ {
+ // Pass c to asm dispatch only if it's the bias tensor
+ ITensorPack asm_pack = tensors;
+ asm_pack.add_const_tensor(ACL_SRC_2, (_reshape_b_only_on_first_run) ? c : nullptr);
+ _asm_glue->run(asm_pack);
+ if(_run_alpha_scale)
+ {
+ ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ _alpha_scale_func->run(pack);
+ }
+ }
+ else
+ {
+ CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true);
+ CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
+ CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
+
+ ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } };
+ if(!_run_vector_matrix_multiplication)
+ {
+ // Run interleave kernel
+ ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } };
+ NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ }
+
+ // Use reshaped matrices
+ mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
+ mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
+ }
+
+ NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack);
+
+ // Run bias addition kernel
+ if(_run_bias_addition)
+ {
+ ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } };
+ _add_bias->run(pack);
+ }
+ }
+
+ // Run matrix addition kernel
+ if(_run_addition)
+ {
+ ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } };
+ NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
+ }
+
+ // Run activation function
+ if(_run_activation)
+ {
+ ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ _activation_func->run(pack);
+ }
+}
+
+void CpuGemm::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ if(_asm_glue->is_configured())
+ {
+ _asm_glue->prepare(tensors);
+ }
+ else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+ {
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
+
+ CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
+ ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ }
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements CpuGemm::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
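For reference, the arithmetic the kernel chain above implements is plain GEMM, d = alpha * A * B + beta * C, with m = a->dimension(1), n = b->dimension(0) and k = a->dimension(0). A naive, illustrative sketch (row-major buffers assumed; the real kernels operate on interleaved/transposed blocks and handle the bias and reshape-once cases separately):

    // Reference GEMM: d[m x n] = alpha * a[m x k] * b[k x n] + beta * c[m x n]
    void reference_gemm(const float *a, const float *b, const float *c, float *d,
                        int m, int n, int k, float alpha, float beta)
    {
        for(int row = 0; row < m; ++row)
        {
            for(int col = 0; col < n; ++col)
            {
                float acc = 0.f;
                for(int i = 0; i < k; ++i)
                {
                    acc += a[row * k + i] * b[i * n + col];
                }
                d[row * n + col] = alpha * acc + ((c != nullptr) ? beta * c[row * n + col] : 0.f);
            }
        }
    }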
diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h
new file mode 100644
index 0000000000..334ab6c647
--- /dev/null
+++ b/src/cpu/operators/CpuGemm.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_H
+#define ARM_COMPUTE_CPU_GEMM_H
+
+#include "src/cpu/ICpuOperator.h"
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuAdd.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to execute GEMM. This function calls the following kernels:
+ *
+ * If optimized assembly is available:
+ * -# @ref cpu::CpuGemmAssemblyDispatch
+ * -# @ref cpu::CpuActivation (if alpha != 1.0)
+ * Else:
+ * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix)
+ * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix)
+ * -# @ref cpu::kernels::CpuGemmMatrixMultiplyKernel
+ * In both cases:
+ * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr, beta != 0.0 and matrix B is not reshaped only on the first run)
+ * Else:
+ * -# @ref cpu::CpuAdd (if c != nullptr, matrix B is reshaped only on the first run and the optimized assembly path is not in place)
+ *
+ * -# @ref cpu::CpuActivation (if activation is specified in GEMMInfo)
+ */
+class CpuGemm : public ICpuOperator
+{
+public:
+ /** Default constructor */
+ CpuGemm() = default;
+ /** Default destructor */
+ ~CpuGemm() = default;
+ /** Configure operator for a given list of arguments
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------------|:-----------|:---------|:--------------|
+ * |F32 |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |F16 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ *
+ * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+ * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
+ *
+ * @param[in] a First input tensor info (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
+ * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
+ * @param[out] d Output tensor info. Data type supported: same as @p a
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of matrix C
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should happen only for the first run
+ */
+ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
+ float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
+ *
+ * Similar to @ref CpuGemm::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
+ float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ Pretraspose,
+ InterleavedLHS,
+ TransposedRHS,
+ TempResult,
+ Count
+ };
+
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{ nullptr };
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{ nullptr };
+ std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr };
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
+ std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr };
+ std::unique_ptr<CpuActivation> _alpha_scale_func{ nullptr };
+ std::unique_ptr<CpuAdd> _add_bias{ nullptr };
+ std::unique_ptr<CpuActivation> _activation_func{ nullptr };
+
+ TensorInfo _tmp_a{};
+ TensorInfo _tmp_b{};
+ TensorInfo _tmp_d{};
+
+ bool _run_vector_matrix_multiplication{ false };
+ bool _run_alpha_scale{ false };
+ bool _run_addition{ false };
+ bool _run_bias_addition{ false };
+ bool _run_activation{ false };
+ bool _reshape_b_only_on_first_run{ false };
+ bool _is_prepared{ false };
+
+ experimental::MemoryRequirements _aux_mem{ Count };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_GEMM_H */
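A minimal configure/validate sketch against the interface above, assuming F32 inputs with M = 32, N = 128, K = 64. The shapes, includes and example function are illustrative only, not part of the patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Validate.h"
    #include "src/cpu/operators/CpuGemm.h"

    using namespace arm_compute;

    void example_cpu_gemm()
    {
        // dimension(0) is the number of columns and dimension(1) the number of rows,
        // so A is [K, M] = [64, 32], B is [N, K] = [128, 64] and D is [N, M] = [128, 32].
        TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::F32);
        TensorInfo b_info(TensorShape(128U, 64U), 1, DataType::F32);
        TensorInfo d_info(TensorShape(128U, 32U), 1, DataType::F32);

        cpu::CpuGemm gemm;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(&a_info, &b_info, nullptr, &d_info, 1.f, 0.f));
        gemm.configure(&a_info, &b_info, nullptr, &d_info, 1.f, 0.f);

        // Execution is done separately: bind the real tensors (ACL_SRC_0/ACL_SRC_1/ACL_DST)
        // and the buffers described by gemm.workspace() into an ITensorPack, call
        // gemm.prepare(pack) once, then gemm.run(pack) for every invocation.
    }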
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
new file mode 100644
index 0000000000..5010792a28
--- /dev/null
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -0,0 +1,612 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmConv2d.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuCol2ImKernel.h"
+#include "src/cpu/kernels/CpuIm2ColKernel.h"
+#include "src/cpu/kernels/CpuReshapeKernel.h"
+#include "src/cpu/kernels/CpuWeightsReshapeKernel.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+#include <set>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuGemmConv2d::CpuGemmConv2d()
+ : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape_kernel(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
+ _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+{
+}
+CpuGemmConv2d::~CpuGemmConv2d() = default;
+
+void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info,
+ bool enable_fast_math, int gemm_3d_depth)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col));
+
+ // Create GEMMInfo structure
+ const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info);
+
+ // Supported activations in GEMM
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+
+ if(_is_quantized)
+ {
+ TensorInfo tmp_src{ *src };
+ TensorInfo tmp_weights{ *weights };
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo iqinfo = src->quantization_info();
+ const QuantizationInfo wqinfo = weights->quantization_info();
+ const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+ const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+ const DataType data_type = src->data_type();
+
+ tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+ if(!is_data_type_quantized_per_channel(tmp_weights.data_type()))
+ {
+ const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+ tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+ }
+
+ // Merge activation with output stage
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+
+ if(supported_acts.count(act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
+
+ _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info));
+
+ auto mm_mem_req = _mm_gemmlowp->workspace();
+ for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+ else
+ {
+ // Configure matrix multiply function
+ _mm_gemm = std::make_unique<CpuGemm>();
+ _mm_gemm->configure(src, weights, biases, dst, 1.0f, 0.0f, gemm_info);
+ auto mm_mem_req = _mm_gemm->workspace();
+ for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+}
+
+Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col)
+{
+ const DataType data_type = src->data_type();
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool is_activation_enabled = act_info.enabled();
+
+ // Create GEMMInfo structure
+ const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info);
+
+ if(is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo &iqinfo = src->quantization_info();
+ const QuantizationInfo &wqinfo = weights->quantization_info();
+ const QuantizationInfo &oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+
+ // Merge activation with output stage
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
+
+ // Perform validation step on GEMMLowp
+ std::unique_ptr<ITensorInfo> input_qa = src->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info,
+ false, enable_fast_math, false, act_info));
+ }
+ else
+ {
+ // Perform validation step on Matrix multiply function
+ return CpuGemm::validate(src, weights, nullptr, dst, 1.0f, 0.0f, gemm_info);
+ }
+}
+
+Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+{
+ const DataType data_type = input_info->data_type();
+ const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
+ const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
+
+ // Set dummy tensor shapes for the validation
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
+ const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+
+ return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col);
+}
+
+void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_UNUSED(num_groups, weights_info);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src,
+ weights,
+ biases,
+ dst,
+ conv_info,
+ weights_info,
+ dilation,
+ act_info,
+ enable_fast_math,
+ num_groups));
+
+ const DataType data_type = src->data_type();
+ const DataLayout data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+
+ _is_prepared = weights_info.retain_internal_weights();
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _data_layout = data_layout;
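+ // im2col can be skipped only for 1x1, stride-1 NHWC convolutions, where the input is already laid out as the GEMM expects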
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+ const ITensorInfo *gemm_input_to_use = src;
+ ITensorInfo *gemm_output_to_use = dst;
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv_info,
+ dilation);
+ ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
+ "Output shape does not match the expected one");
+
+ // Check if GEMM3D is supported
+ if(data_layout == DataLayout::NHWC)
+ {
+ _skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!_skip_col2im)
+ {
+ _skip_im2col = false;
+ }
+ }
+ else
+ {
+ _skip_col2im = false;
+ }
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
+ unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+
+ // _weights_reshaped will be auto configured in the kernel.
+ // Just append biases and do not transpose 1xW as it will be reshaped in CpuGemm
+ _weights_reshape_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
+ _weights_reshape_kernel->configure(weights, nullptr, &_weights_reshaped);
+ _weights_reshaped.set_quantization_info(weights->quantization_info());
+
+ // Create tensor to store im2col reshaped inputs
+ if(!_skip_im2col)
+ {
+ // Configure
+ _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
+ _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
+
+ // Update GEMM input
+ gemm_input_to_use = &_im2col_output;
+ }
+
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
+ if(!_skip_col2im)
+ {
+ TensorShape shape_gemm;
+
+ // Calculate GEMM output shape
+ shape_gemm = _im2col_output.tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+
+ _gemm_output = TensorInfo(shape_gemm, 1, output_data_type);
+ _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ _gemm_output_3d = TensorInfo(_gemm_output);
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output;
+ }
+ else
+ {
+ _gemm_output_3d = TensorInfo(*dst);
+ _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true);
+ _gemm_output = TensorInfo(_gemm_output_3d);
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output_3d;
+ }
+
+ // Configure GEMM
+ // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth);
+
+ if(!_skip_col2im && _data_layout == DataLayout::NCHW)
+ {
+ // Configure col2im
+ _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
+ _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h));
+ }
+ else
+ {
+ // Configure reshape layer
+ _reshape_kernel = std::make_unique<kernels::CpuReshapeKernel>();
+ _reshape_kernel->configure(gemm_output_to_use, dst);
+ }
+
+ // Check if GEMM transforms weights
+ // Modernise through COMPMID-4535
+ bool gemm_trans_wei = _aux_mem[1].size > 0; // Asm Pretranspose
+ gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[3].size > 0 : gemm_trans_wei; // Transpose RHS
+ gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
+
+ // Check lifetime
+ _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+}
+
+Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
+
+ const DataLayout data_layout = src->data_layout();
+ const DataType data_type = src->data_type();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo tmp_info{};
+ TensorInfo weights_reshaped_info{};
+ const ITensorInfo *gemm_input_to_use = src;
+ const ITensorInfo *gemm_output_to_use = dst;
+ const ITensorInfo *weights_to_use = weights;
+
+ const bool append_bias = false;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool is_bf16 = data_type == DataType::BFLOAT16;
+ bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv_info,
+ dilation);
+
+ // Check if GEMM3D is supported
+ bool skip_col2im = false;
+ if(data_layout == DataLayout::NHWC)
+ {
+ skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!skip_col2im)
+ {
+ skip_im2col = false;
+ }
+ }
+
+ if(skip_col2im)
+ {
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!bool(validate_gemm3d(src, weights, act_info, conv_h, skip_im2col)))
+ {
+ skip_im2col = false;
+ skip_col2im = false;
+ }
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else if(is_bf16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+ unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
+ weights_reshaped_info.set_quantization_info(weights->quantization_info());
+ weights_to_use = &weights_reshaped_info;
+
+ if(!skip_im2col)
+ {
+ // Create tensor info for im2col reshaped inputs
+ // For CPU, the batch size is on the fourth dimension
+ TensorShape shape_im2col = src->tensor_shape();
+ shape_im2col.set(0, mat_weights_rows);
+ shape_im2col.set(1, conv_w * conv_h);
+ shape_im2col.set(2, 1);
+
+ im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
+ im2col_reshaped_info.set_quantization_info(src->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+ gemm_input_to_use = &im2col_reshaped_info;
+ }
+
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
+ if(!skip_col2im)
+ {
+ TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+ info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
+ }
+ else
+ {
+ info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type);
+ }
+ info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ gemm_output_to_use = &info_gemm;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col));
+
+ // Validate Col2Im/ReshapeLayer
+ if(!skip_col2im && (data_layout == DataLayout::NCHW))
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+ }
+
+ return Status{};
+}
+
+void CpuGemmConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+ auto dst = tensors.get_tensor(ACL_DST);
+ auto gemm_input_to_use = src;
+
+ CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
+ CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+ CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
+
+ bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
+ if(!_skip_im2col)
+ {
+ // Run input reshaping
+ unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, src },
+ { TensorType::ACL_DST, im2col_output.get() }
+ };
+ NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
+ gemm_input_to_use = im2col_output.get();
+ }
+
+ // Handle the case where output has top/bottom padding
+ const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst;
+ Tensor gemm3d;
+ _gemm_output_3d.extend_padding(out_to_use->info()->padding());
+ gemm3d.allocator()->soft_init(_gemm_output_3d);
+ gemm3d.allocator()->import_memory(out_to_use->buffer());
+ auto gemm_output_to_use = gemm_output.get();
+
+ if(_skip_im2col)
+ {
+ gemm_output_to_use = &gemm3d;
+ }
+ if(_skip_col2im && !out_has_padding)
+ {
+ gemm_output_to_use = dst;
+ }
+
+ // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
+ ITensorPack pack_mm = tensors;
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
+ pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
+ if(_is_quantized)
+ {
+ // Run gemmlowp
+ _mm_gemmlowp->run(pack_mm);
+ }
+ else
+ {
+ // Run gemm
+ _mm_gemm->run(pack_mm);
+ }
+
+ // Reshape output matrix
+ if(!_skip_col2im)
+ {
+ if(_data_layout == DataLayout::NCHW)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, gemm_output.get() },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
+ }
+ else
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, gemm_output_to_use },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+ }
+ }
+ else if(out_has_padding)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, gemm_output_to_use },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+ }
+}
+
+void CpuGemmConv2d::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ // Run weights reshaping and mark original weights tensor as unused
+ CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, weights },
+ { TensorType::ACL_DST, weights_reshaped.get() }
+ };
+ NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
+ weights->mark_as_unused();
+
+ // Prepare GEMM
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+ _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack);
+
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements CpuGemmConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
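The shape bookkeeping in configure()/validate() above is easier to follow with concrete numbers. An illustrative sketch, assuming a 3x3 kernel, IFM = 16, OFM = 32 and a 224x224 input convolved at stride 1 with 1-pixel padding, so conv_w = conv_h = 224 (all sizes hypothetical):

    const unsigned int mat_weights_rows = 3 * 3 * 16; // kernel_x * kernel_y * IFM = 144
    const unsigned int mat_weights_cols = 32;         // OFM, i.e. the number of kernels
    const unsigned int conv_w = 224;                  // from scaled_dimensions()
    const unsigned int conv_h = 224;

    // im2col output (gemm_input_to_use):  [mat_weights_rows, conv_w * conv_h] = [144, 50176]
    // GEMM output   (gemm_output_to_use): [mat_weights_cols, conv_w * conv_h] = [32, 50176]
    // CpuCol2ImKernel (NCHW) or CpuReshapeKernel (otherwise) then folds the GEMM
    // output back into the [width, height, OFM] destination.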
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
new file mode 100644
index 0000000000..e63e7169b0
--- /dev/null
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_CONV2D_H
+#define ARM_COMPUTE_CPU_GEMM_CONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuGemm;
+class CpuGemmLowpMatrixMultiplyCore;
+class CpuGemmLowpOutputStage;
+namespace kernels
+{
+class CpuWeightsReshapeKernel;
+class CpuIm2ColKernel;
+class CpuCol2ImKernel;
+class CpuReshapeKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
+ *
+ * -# @ref cpu::kernels::CpuIm2ColKernel
+ * -# @ref CpuGemm (if the data type is BFLOAT16/FP16/FP32)
+ * -# @ref CpuGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref CpuGemmLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout)
+ * -# @ref kernels::CpuWeightsReshapeKernel
+ *
+ */
+class CpuGemmConv2d : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuGemmConv2d(const CpuGemmConv2d &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ CpuGemmConv2d(CpuGemmConv2d &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuGemmConv2d &operator=(const CpuGemmConv2d &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ CpuGemmConv2d &operator=(CpuGemmConv2d &&) = delete;
+ /** Destructor */
+ ~CpuGemmConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with kernels::CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p src.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a drop in accuracy as well. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false, unsigned int num_groups = 1);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ /** Configures the appropriate matrix multiply routine
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src,
+ * except for src of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a drop in accuracy as well. Default is false
+ * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+ */
+ void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false, int gemm_3d_depth = 1);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmConv2d matrix multiply routines
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] dst Output tensor info. Data types supported: Same as @p src,
+ * except for src of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a drop in accuracy as well. Default is false
+ * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+ * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
+ *
+ * @return a status
+ */
+ static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false);
+ /** Static function to check if GEMM3D is supported in @ref CpuGemm or in @ref CpuGemmLowpMatrixMultiplyCore
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] gemm_3d_depth Depth of GEMM 3D
+ * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
+ *
+ * @return a status
+ */
+ static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+
+ enum AuxTensorIdx
+ {
+ // CpuGemmLowpMatrixMultiplyCore has up to 8 internal tensors
+ Im2ColOutput = 9,
+ WeightsReshaped,
+ GemmOutput,
+ Count
+ };
+
+ std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_kernel;
+ std::unique_ptr<cpu::kernels::CpuIm2ColKernel> _im2col_kernel;
+ std::unique_ptr<CpuGemm> _mm_gemm;
+ std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+ std::unique_ptr<kernels::CpuCol2ImKernel> _col2im_kernel;
+ std::unique_ptr<kernels::CpuReshapeKernel> _reshape_kernel;
+
+ TensorInfo _im2col_output;
+ TensorInfo _weights_reshaped;
+ TensorInfo _gemm_output;
+ TensorInfo _gemm_output_3d;
+
+ DataLayout _data_layout;
+
+ bool _skip_im2col;
+ bool _skip_col2im;
+ bool _is_quantized;
+ bool _is_prepared;
+
+ experimental::MemoryRequirements _aux_mem{ Count };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMM_CONV2D_H */
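The AuxTensorIdx enum above starts at 9 so that this operator's auxiliary-memory slots do not collide with the nine slots (indices 0-8) that the nested CpuGemmLowpMatrixMultiplyCore declares for itself further down in this patch. A minimal sketch of the slot arithmetic, derived only from the two enums shown in this diff rather than from the library itself:

// Sketch only, mirroring the two AuxTensorIdx enums in this patch; not library code.
enum GemmLowpAuxTensorIdx // from CpuGemmLowpMatrixMultiplyCore.h further down
{
    AsmGemmWorkspace = 0, Pretranspose, VectorSumCol, VectorSumRow,
    TmpA, TmpB, MMResultS32, SignedA, SignedOutput,
    GemmLowpCount // == 9
};

enum ConvAuxTensorIdx // from CpuGemmConv2d.h above
{
    Im2ColOutput = GemmLowpCount, // == 9: first index after the nested operator's slots
    WeightsReshaped,
    GemmOutput,
    ConvCount
};

static_assert(Im2ColOutput == 9, "conv slots must not overlap the nested GEMM-lowp slots");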
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
new file mode 100644
index 0000000000..2e17a21462
--- /dev/null
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+#include "support/Cast.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::utils::cast;
+
+namespace
+{
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
+{
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo iqinfo = src->quantization_info();
+ const QuantizationInfo wqinfo = weights->quantization_info();
+ const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+ const DataType data_type = src->data_type();
+ // Merge activation with output stage
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+ if(supported_acts.count(act.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
+ }
+ GEMMLowpOutputStageInfo os_info;
+ os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ os_info.gemmlowp_offset = uoqinfo.offset;
+ os_info.gemmlowp_min_bound = min_activation;
+ os_info.gemmlowp_max_bound = max_activation;
+ os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
+ return os_info;
+}
+cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
+ asm_info.ps_info = info.conv_info;
+ asm_info.activation_info = info.act_info;
+ asm_info.depth_output_gemm3d = true;
+ asm_info.reinterpret_input_as_3d = true;
+ asm_info.padding_top = info.conv_info.pad_top();
+ asm_info.padding_left = info.conv_info.pad_left();
+ asm_info.padding_value = 0.f;
+ asm_info.negated_offsets = false;
+ asm_info.fast_mode = info.enable_fast_math;
+ return asm_info;
+}
+} // namespace
+
+CpuGemmDirectConv2d::CpuGemmDirectConv2d()
+ : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>()),
+ _activation_func(std::make_unique<CpuActivation>()),
+ _weights_permute_func(std::make_unique<CpuPermute>()),
+ _aux_mem(AuxTensorIdx::Count),
+ _perm_weights(),
+ _run_activation(false),
+ _is_prepared(false)
+{
+}
+
+CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
+
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, weights, biases, dst, info));
+ _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
+ _is_prepared = false;
+
+ _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 });
+
+ // Configure assembly dispatch
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+ if(is_data_type_quantized(src->data_type()))
+ {
+ asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
+ }
+ _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
+
+ // Configure activation
+ if(_run_activation)
+ {
+ _activation_func->configure(dst, nullptr, info.act_info);
+ }
+
+ // Add auxiliary memory requirements of the assembly dispatch
+ auto asm_mem_req = _gemm_asm_func->workspace();
+ _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
+ _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
+
+ if(_aux_mem[Pretranspose].size > 0)
+ {
+ // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
+ _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
+ }
+ else
+ {
+ _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
+ }
+}
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
+ const DataType data_type = src->data_type();
+ const TensorShape i_shape = src->tensor_shape();
+ const TensorShape w_shape = weights->tensor_shape();
+ ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ // Validate biases
+ if(biases != nullptr)
+ {
+ if(is_data_type_quantized_asymmetric(data_type))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else if(data_type == DataType::BFLOAT16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info));
+ return Status{};
+}
+void CpuGemmDirectConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ _gemm_asm_func->run(tensors);
+ if(_run_activation)
+ {
+ _activation_func->run(tensors);
+ }
+}
+
+void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
+
+ CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux);
+ ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ _weights_permute_func->run(permute_tensors);
+
+ tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get());
+ // Call prepare of assembly dispatch
+ _gemm_asm_func->prepare(tensors);
+
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
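calculate_output_stage_metadata() above folds a fused RELU-family activation into the integer clamp bounds of the GEMM-lowp output stage. The rough sketch below shows that bound computation under the usual asymmetric quantization convention real = scale * (q - offset); the helper names are illustrative stand-ins, not the library's get_quantized_activation_min_max().

// Illustrative sketch (assumed convention: real = scale * (q - offset)).
// Shows how a fused RELU-family activation becomes the integer clamp bounds
// of the GEMM-lowp output stage; not the library helper itself.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo
{
    float   scale;
    int32_t offset;
};

static int32_t quantize(float x, const UniformQInfo &qi)
{
    return static_cast<int32_t>(std::lround(x / qi.scale)) + qi.offset;
}

// act_kind: 0 = RELU, 1 = BOUNDED_RELU(upper a), 2 = LU_BOUNDED_RELU(upper a, lower b)
static void activation_clamp_bounds(int act_kind, float a, float b, const UniformQInfo &out_q,
                                    int32_t type_min, int32_t type_max,
                                    int32_t &min_bound, int32_t &max_bound)
{
    min_bound = (act_kind == 2) ? quantize(b, out_q) : quantize(0.f, out_q); // lower clamp: b or 0
    max_bound = (act_kind == 0) ? type_max : quantize(a, out_q);             // upper clamp: a or the type maximum
    min_bound = std::max(min_bound, type_min);
    max_bound = std::min(max_bound, type_max);
}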
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
new file mode 100644
index 0000000000..e55a461f36
--- /dev/null
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
+#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+struct Conv2dInfo;
+namespace cpu
+{
+class CpuGemmDirectConv2d : public ICpuOperator
+{
+public:
+ CpuGemmDirectConv2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d);
+ ~CpuGemmDirectConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] info Convolution layer descriptor (@ref Conv2dInfo), carrying padding/stride (@ref PadStrideInfo), dilation, fused activation, fast-math and grouping information.
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
+ *
+ * Similar to CpuGemmDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ Pretranspose,
+ PermutedWeights,
+ Count
+ };
+
+ std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<CpuPermute> _weights_permute_func;
+ experimental::MemoryRequirements _aux_mem;
+ TensorInfo _perm_weights;
+ bool _run_activation;
+ bool _is_prepared;
+};
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */
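A hedged usage sketch of the configure/validate/run flow documented in this header. The ITensorPack slot ids follow the ones used by CpuGemmDirectConv2d::prepare() in this patch; the Conv2dInfo constructor argument order and the MemoryInfo slot/size member names used for the auxiliary workspace are assumptions, and real integration code would normally let a higher-level function or memory manager handle that plumbing.

// Hedged usage sketch, not library documentation. The Conv2dInfo constructor
// order and the MemoryInfo::slot / MemoryInfo::size member names are assumptions.
#include <memory>
#include <vector>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuGemmDirectConv2d.h"

using namespace arm_compute;

void gemm_direct_conv2d_sketch()
{
    // NHWC: channels are dimension 0. src [IFM, W, H, N], weights [IFM, kx, ky, OFM], dst [OFM, W, H, N].
    TensorInfo src_info(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(64U, 1U, 1U, 128U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U, 56U, 56U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    wei_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);

    // Assumed argument order: (conv_info, dilation, act_info, enable_fast_math, num_groups).
    const Conv2dInfo conv_info(PadStrideInfo(1, 1, 0, 0), Size2D(1U, 1U), ActivationLayerInfo(), false, 1);

    cpu::CpuGemmDirectConv2d conv;
    if(bool(cpu::CpuGemmDirectConv2d::validate(&src_info, &wei_info, nullptr, &dst_info, conv_info)))
    {
        conv.configure(&src_info, &wei_info, nullptr, &dst_info, conv_info);

        Tensor src, wei, dst;
        src.allocator()->init(src_info);
        wei.allocator()->init(wei_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        wei.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack{ { ACL_SRC_0, &src }, { ACL_SRC_1, &wei }, { ACL_DST, &dst } };

        // Back the auxiliary tensors reported by workspace() (AsmGemmWorkspace,
        // Pretranspose, PermutedWeights) with plain U8 buffers and register them
        // under their slot ids so that prepare()/run() can find them.
        std::vector<std::unique_ptr<Tensor>> aux;
        for(const auto &req : conv.workspace())
        {
            if(req.size == 0)
            {
                continue;
            }
            aux.emplace_back(std::make_unique<Tensor>());
            aux.back()->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
            aux.back()->allocator()->allocate();
            pack.add_tensor(req.slot, aux.back().get());
        }

        conv.run(pack); // run() calls prepare() on the first invocation
    }
}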
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..2074a89307
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,711 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+
+#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.output_stage = info.gemmlowp_output_stage();
+ asm_info.fast_mode = info.fast_math();
+
+ return asm_info;
+}
+} // namespace
+
+CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
+ : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
+ _mm_kernel(),
+ _mtx_a_reshape_kernel(),
+ _mtx_b_reshape_kernel(),
+ _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(),
+ _offset_contribution_kernel(),
+ _offset_contribution_output_stage_kernel(),
+ _activation_func(),
+ _convert_to_signed_asymm(),
+ _convert_from_signed_asymm(),
+ _vector_sum_col(),
+ _vector_sum_row(),
+ _tmp_a(),
+ _tmp_b(),
+ _mm_result_s32(),
+ _signed_a(),
+ _signed_output(),
+ _a_offset(0),
+ _b_offset(0),
+ _run_vector_matrix_multiplication(false),
+ _assembly_path(false),
+ _fused_assembly_path(false),
+ _reshape_b_only_on_first_run(false),
+ _is_prepared(false),
+ _fuse_output_stage(false),
+ _run_activation(false),
+ _flip_signedness(false),
+ _gemm_info(),
+ _aux_mem(Count)
+{
+}
+CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
+
+void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
+
+ const ITensorInfo *matrix_a = a;
+ const ITensorInfo *matrix_b = b;
+ GEMMInfo info = gemm_info;
+
+ // Set internal variables
+ _a_offset = a->quantization_info().uniform().offset;
+ _b_offset = b->quantization_info().uniform().offset;
+ _run_vector_matrix_multiplication = a->dimension(1) < 2;
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
+ _is_prepared = false;
+ _fused_assembly_path = false;
+ _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
+ _gemm_info = gemm_info;
+
+ _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+
+ const ITensorInfo *a_to_use = a;
+
+ // Convert to QASYMM8 -> QASYMM8_SIGNED and back
+ if(_flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
+ _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
+ a_to_use = &_signed_a;
+ _a_offset = _signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+ _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a = &_signed_a;
+ }
+
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ _fuse_output_stage = true;
+ _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
+ }
+
+ // Initialize assembly kernel meta-data
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+#ifdef __aarch64__
+ switch(a->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::U8:
+ case DataType::S8:
+ {
+ if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ _asm_glue->configure(a_to_use, b, c, dst, asm_info);
+ _fused_assembly_path = _asm_glue->is_configured();
+ }
+ else
+ {
+ auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
+ _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
+ }
+ _assembly_path = _asm_glue->is_configured();
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
+ }
+#endif /* __aarch64__ */
+ if(!(_assembly_path || _run_vector_matrix_multiplication))
+ {
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
+
+ // Configure interleave kernel
+ _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
+ _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
+
+ // Configure transpose kernel
+ _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
+ _mtx_b_reshape_kernel->configure(b, &_tmp_b);
+ }
+
+ if(!_fused_assembly_path)
+ {
+ // Build reduction info
+ const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
+ _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
+ }
+
+ if(_fuse_output_stage)
+ {
+ // Configure matrix multiply kernel
+ if(!_assembly_path)
+ {
+ _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
+ }
+
+ _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : dst,
+ a->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+ if(_flip_signedness)
+ {
+ _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
+ _convert_from_signed_asymm->configure(&_signed_output, dst);
+ }
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ if(!_assembly_path)
+ {
+ _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
+ _mm_kernel->configure(matrix_a, matrix_b, dst);
+ }
+ // Configure offset contribution kernel
+ _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
+ _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
+ _a_offset, _b_offset);
+ }
+ }
+ // Configure activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+ if(_run_activation)
+ {
+ _activation_func = std::make_unique<CpuActivation>();
+ _activation_func->configure(dst, nullptr, activation);
+ }
+
+ if(_assembly_path)
+ {
+ auto asm_mem_req = _asm_glue->workspace();
+ _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
+ _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
+ }
+
+ // Request memory for LHS and RHS reshape matrix
+ _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol),
+ (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _vector_sum_col.total_size());
+ _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+ _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+}
+
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ GEMMInfo info = gemm_info;
+ const ITensorInfo *matrix_a_info = a;
+ const ITensorInfo *matrix_b_info = b;
+
+ const ITensorInfo *a_to_use = a;
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+ TensorInfo mm_result_s32_info{};
+
+ int32_t a_offset = a->quantization_info().uniform().offset;
+ int32_t b_offset = b->quantization_info().uniform().offset;
+
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ if(fuse_output_stage)
+ {
+ auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ }
+
+ // Convert QASYMM8->QASYMM8_SIGNED
+ TensorInfo signed_a{};
+ TensorInfo signed_output{};
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if(flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+ a_to_use = &signed_a;
+ a_offset = signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+
+ // Update matrix a
+ matrix_a_info = &signed_a;
+ }
+
+ // Initialize assembly kernel meta-data
+ const AsmGemmInfo asm_info = init_assembly_metadata(info);
+
+ // Check if we need to run the optimized assembly kernel
+ bool run_optimised = false;
+ bool run_optimised_requantized = false;
+ if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
+ run_optimised_requantized = run_optimised;
+ }
+ else
+ {
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+ }
+
+ if(run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+ if(info.depth_output_gemm3d() != 0)
+ {
+ if(info.reinterpret_input_as_3d())
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+
+ const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+ if(!run_vector_matrix_multiplication)
+ {
+ matrix_a_info = &tmp_a_info;
+ matrix_b_info = &tmp_b_info;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ // Validate interleave kernel
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+ }
+ }
+
+ if(!run_optimised_requantized)
+ {
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
+
+ const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
+ // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+ if(a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+ }
+
+ // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+ }
+
+ if(fuse_output_stage)
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c,
+ flip_signedness ? &signed_output : output,
+ a_offset, b_offset,
+ info.gemmlowp_output_stage()));
+ }
+ else
+ {
+ if(!run_optimised)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ }
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
+ }
+ }
+
+ // Validate activation
+ const ActivationLayerInfo &activation = gemm_info.activation_info();
+ if(activation.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
+ }
+
+ return Status{};
+}
+
+void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ auto a_to_use = a;
+ auto matrix_a = a;
+ auto matrix_b = b;
+
+ CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
+ CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
+ CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
+ CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
+ CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
+ CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
+ CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
+
+ // Convert QASYMM8->QASYMM8_SIGNED
+ if(_flip_signedness)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, a },
+ { TensorType::ACL_DST, signed_a.get() }
+ };
+ NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
+ a_to_use = signed_a.get();
+ matrix_a = signed_a.get();
+ }
+
+ // Run GEMM
+ if(_asm_glue->is_configured())
+ {
+ ITensorPack asm_glue_tensors = tensors;
+ auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
+ if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
+ asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
+ }
+ else
+ {
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+ asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
+ asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
+ }
+ _asm_glue->run(asm_glue_tensors);
+ }
+ else
+ {
+ if(!_run_vector_matrix_multiplication)
+ {
+ matrix_a = tmp_a.get();
+ matrix_b = tmp_b.get();
+ // Run interleave kernel
+ ITensorPack pack_a =
+ {
+ { TensorType::ACL_SRC, a_to_use },
+ { TensorType::ACL_DST, tmp_a.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);
+
+ if(!_reshape_b_only_on_first_run)
+ {
+ ITensorPack pack_b =
+ {
+ { TensorType::ACL_SRC, b },
+ { TensorType::ACL_DST, tmp_b.get() }
+ };
+ // Run transpose kernel
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
+ }
+ }
+ ITensorPack pack_mm =
+ {
+ { TensorType::ACL_SRC_0, matrix_a },
+ { TensorType::ACL_SRC_1, matrix_b }
+ };
+ if(_fuse_output_stage)
+ {
+ pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
+ }
+ else
+ {
+ pack_mm.add_tensor(TensorType::ACL_DST, dst);
+ }
+ NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
+ }
+
+ if(!_fused_assembly_path)
+ {
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, a_to_use },
+ { TensorType::ACL_DST, vector_sum_row.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, b },
+ { TensorType::ACL_DST, vector_sum_col.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ }
+
+ if(_fuse_output_stage)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
+ pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
+ pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
+ pack.add_tensor(TensorType::ACL_SRC_3, c);
+ pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
+
+ // Run offset contribution kernel
+ NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
+ }
+ else
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
+ pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
+ pack.add_tensor(TensorType::ACL_DST, dst);
+
+ // Run offset contribution kernel
+ NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
+ }
+ }
+
+ // Convert QASYMM8_SIGNED->QASYMM8
+ if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, signed_output.get() },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
+ }
+
+ // Run fused activation unless already run in the fused assembly
+ if(_run_activation)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, dst },
+ { TensorType::ACL_DST, dst }
+ };
+ _activation_func->run(pack);
+ }
+}
+
+void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ // Run assembly reshape
+ if(_asm_glue->is_configured())
+ {
+ _asm_glue->prepare(tensors);
+ }
+ // Run non-assembly reshape
+ else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ {
+ // Run reshape kernel and mark original weights tensor as unused
+ ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
+ CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, original_b },
+ { TensorType::ACL_DST, tmp_b.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ ITensor *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
+ CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, original_b },
+ { TensorType::ACL_DST, vector_sum_col.get() }
+ };
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ }
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
new file mode 100644
index 0000000000..a7f62aeaa9
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmInterleave4x4Kernel;
+class CpuGemmLowpMatrixMultiplyKernel;
+class CpuGemmLowpOffsetContributionKernel;
+class CpuGemmLowpOffsetContributionOutputStageKernel;
+class CpuGemmLowpMatrixAReductionKernel;
+class CpuGemmLowpMatrixBReductionKernel;
+class CpuGemmTranspose1xWKernel;
+class CpuConvertQuantizedSignednessKernel;
+} // namespace kernels
+class CpuGemmAssemblyDispatch;
+class CpuActivation;
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
+ *
+ * -# @ref kernels::CpuGemmInterleave4x4Kernel
+ * -# @ref kernels::CpuGemmTranspose1xWKernel
+ * -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel
+ * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ * -# @ref CpuActivation
+ *
+ * otherwise if the DOT product instruction is available:
+ *
+ * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ *
+*/
+class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmLowpMatrixMultiplyCore();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore);
+ /** Destructor */
+ ~CpuGemmLowpMatrixMultiplyCore();
+ /** Initialise the kernel's inputs, output
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QASYMM8 |S32 |S32 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8 |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ *
+ * @note GEMM_LOWP: low precision GEMM kernel
+ * This kernel performs the following computations:
+ *
+ * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+ * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ *
+ * @note The @p dst type is S32 if @p gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
+ *
+ * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+ * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
+ * @param[out] dst Output tensor info. Data type supported: S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ Pretranspose,
+ VectorSumCol,
+ VectorSumRow,
+ TmpA,
+ TmpB,
+ MMResultS32,
+ SignedA,
+ SignedOutput,
+ Count
+ };
+
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel> _mm_kernel;
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _mtx_a_reshape_kernel;
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _mtx_b_reshape_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
+ std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_to_signed_asymm;
+ std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_from_signed_asymm;
+
+ TensorInfo _vector_sum_col;
+ TensorInfo _vector_sum_row;
+ TensorInfo _tmp_a;
+ TensorInfo _tmp_b;
+ TensorInfo _mm_result_s32;
+ TensorInfo _signed_a;
+ TensorInfo _signed_output;
+ int32_t _a_offset;
+ int32_t _b_offset;
+
+ bool _run_vector_matrix_multiplication;
+ bool _assembly_path;
+ bool _fused_assembly_path;
+ bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
+ bool _fuse_output_stage;
+ bool _run_activation;
+ bool _flip_signedness;
+ GEMMInfo _gemm_info;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
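The GEMM_LOWP note above describes the core computation in prose; the standalone scalar reference below spells it out and shows why the reduction kernels depend on the opposite operand's offset, matching the conditions used in configure() (matrix B column sums only when a_offset != 0, matrix A row sums only when b_offset != 0). This is an illustration, not library code.

// Scalar reference: dst[i][j] = sum_k (a[i][k] + a_offset) * (b[k][j] + b_offset).
// Expanding the product gives
//   sum_k a*b  +  a_offset * colsum_B[j]  +  b_offset * rowsum_A[i]  +  K * a_offset * b_offset,
// which is why the column sums of B are only needed when a_offset != 0 and the
// row sums of A only when b_offset != 0 (mirroring the configure() logic above).
#include <cstdint>
#include <vector>

std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                        int M, int N, int K, int32_t a_offset, int32_t b_offset)
{
    std::vector<int32_t> dst(static_cast<size_t>(M) * N, 0);
    for(int i = 0; i < M; ++i)
    {
        for(int j = 0; j < N; ++j)
        {
            int32_t acc = 0;
            for(int k = 0; k < K; ++k)
            {
                acc += (static_cast<int32_t>(a[i * K + k]) + a_offset) * (static_cast<int32_t>(b[k * N + j]) + b_offset);
            }
            dst[i * N + j] = acc;
        }
    }
    return dst;
}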
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
new file mode 100644
index 0000000000..ebd3f60280
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
+
+ switch(info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ switch(info.output_data_type)
+ {
+ case DataType::QASYMM8:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ case DataType::QASYMM8_SIGNED:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ case DataType::QSYMM16:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported output data type.");
+ break;
+ }
+ }
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ {
+ switch(info.output_data_type)
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ {
+ auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel>();
+ k->configure(src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported output data type.");
+ break;
+ }
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
+
+ switch(info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ switch(dst->data_type())
+ {
+ case DataType::QASYMM8:
+ return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ case DataType::QASYMM8_SIGNED:
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ case DataType::QSYMM16:
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
+ }
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ {
+ switch(dst->data_type())
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ return kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
+ }
+ }
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+void CpuGemmLowpOutputStage::run(ITensorPack &tensors)
+{
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h
new file mode 100644
index 0000000000..39394f6b5f
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
+
+#include "arm_compute/core/Types.h"
+#include "src/cpu/ICpuOperator.h"
+
+/** This file contains all available output stages for GEMMLowp.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final ASYMM8 value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to execute GEMMLowpQuantizeDown kernels.
+ *
+ * This function calls the following kernels:
+ *
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+*/
+class CpuGemmLowpOutputStage : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:-------------|
+ * |S32 |S32 |QASYMM8 |
+ * |S32 |S32 |QASYMM8_SIGNED|
+ * |S32 |S32 |QSYMM16 |
+ *
+ * @param[in] src Input tensor info. Data type supported: S32
+     * @param[in]  bias Biases tensor info. Only shared biases are supported; it can be nullptr if the bias addition is not required.
+     *                  Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst  Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in] info GEMMLowp output stage metadata.
+ */
+ void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpOutputStage::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H */
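The QUANTIZE_DOWN_FIXEDPOINT stage listed above follows the gemmlowp requantization recipe: a saturating rounding doubling high multiplication, a rounding right shift, the result offset, and a clamp to the output range. A scalar sketch of that arithmetic, for reference only; the kernels implement vectorised equivalents, a non-negative shift is assumed, and the helper names here are illustrative.

#include <algorithm>
#include <cstdint>
#include <limits>

// Scalar sketch of the gemmlowp fixed-point "quantize down" output stage.
int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    // Overflows only for a == b == INT32_MIN; saturate in that case.
    if(a == b && a == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
}

int32_t rounding_divide_by_pot(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int shift, int32_t offset, int32_t min_bound, int32_t max_bound)
{
    int32_t v = saturating_rounding_doubling_high_mul(acc, multiplier);
    v         = rounding_divide_by_pot(v, shift);
    v += offset;                                     // add the destination zero point
    v = std::max(min_bound, std::min(max_bound, v)); // clamp, e.g. to [0, 255] for QASYMM8
    return static_cast<uint8_t>(v);
}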
diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp
new file mode 100644
index 0000000000..06a68d64a8
--- /dev/null
+++ b/src/cpu/operators/CpuMul.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuMul.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuMulKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
+}
+
+void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ auto k = std::make_unique<kernels::CpuMulKernel>();
+ k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy);
+ _kernel = std::move(k);
+}
+
+void CpuMul::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+
+Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
+}
+
+void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(act_info);
+ auto k = std::make_unique<kernels::CpuComplexMulKernel>();
+ k->configure(src1, src2, dst);
+ _kernel = std::move(k);
+}
+
+void CpuComplexMul::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
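These operators carry no tensor state: configure() and validate() only see ITensorInfo, and run() receives the backing tensors through an ITensorPack at execution time. A minimal calling sketch under those assumptions; the pack slot names (ACL_SRC_0/ACL_SRC_1/ACL_DST) are taken on trust from the experimental operator interface rather than from this patch.

// Sketch only: elementwise multiply two pre-allocated F32 tensors with no rescaling.
#include "src/cpu/operators/CpuMul.h"

void run_mul_example(arm_compute::ITensor &src0, arm_compute::ITensor &src1, arm_compute::ITensor &dst)
{
    arm_compute::cpu::CpuMul mul;
    mul.configure(src0.info(), src1.info(), dst.info(),
                  1.f /* scale */, arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);

    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src0);
    pack.add_tensor(arm_compute::TensorType::ACL_SRC_1, &src1);
    pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst);
    mul.run(pack);
}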
diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h
new file mode 100644
index 0000000000..576a357d42
--- /dev/null
+++ b/src/cpu/operators/CpuMul.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_MUL_H
+#define ARM_COMPUTE_CPU_MUL_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuMulKernel */
+class CpuMul : public ICpuOperator
+{
+public:
+    /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32).
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     dst             Destination tensor info. Data types supported:
+ * - U8, only if both inputs are U8.
+ * - QASYMM8, only if both inputs are QASYMM8.
+ * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED.
+ * - S16.
+ * - QSYMM16, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
+ * - F16, only if @p src1 is F16.
+ * - F32, only if both inputs are F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
+ * @param[in] rounding_policy Rounding policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+
+/** Basic function to run @ref kernels::CpuComplexMulKernel */
+class CpuComplexMul : public ICpuOperator
+{
+public:
+    /** Initialise the kernel's inputs and dst.
+ *
+ * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuComplexMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_MUL_H */
\ No newline at end of file
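The scale restriction documented for configure() above (exactly 1/255, or 1/2^n with n between 0 and 15) can be made concrete with a small check; this helper is illustrative only and not part of the library.

// Returns true if 'scale' is one of the values CpuMul documents as supported.
bool is_supported_cpu_mul_scale(float scale)
{
    if(scale == 1.f / 255.f)
    {
        return true;
    }
    for(int n = 0; n <= 15; ++n)
    {
        if(scale == 1.f / static_cast<float>(1 << n))
        {
            return true;
        }
    }
    return false;
}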
diff --git a/src/cpu/operators/CpuPRelu.h b/src/cpu/operators/CpuPRelu.h
new file mode 100644
index 0000000000..084474e2ba
--- /dev/null
+++ b/src/cpu/operators/CpuPRelu.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_PRELU_H
+#define ARM_COMPUTE_CPU_PRELU_H
+
+#include "src/cpu/operators/CpuElementwise.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the PRelu operation */
+using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_PRELU_H */
\ No newline at end of file
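The aliased arithmetic kernel applies the PRelu rule element-wise, taking the slope from the second input. In scalar form (a sketch of the operation, not of the kernel itself):

// dst = src1 > 0 ? src1 : alpha * src1, applied element-wise with alpha taken from src2.
float prelu(float x, float alpha)
{
    return x > 0.f ? x : alpha * x;
}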
diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp
new file mode 100644
index 0000000000..d730815313
--- /dev/null
+++ b/src/cpu/operators/CpuPermute.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPermute.h"
+
+#include "src/cpu/kernels/CpuPermuteKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+{
+ auto k = std::make_unique<kernels::CpuPermuteKernel>();
+ k->configure(src, dst, perm);
+ _kernel = std::move(k);
+}
+
+Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ return kernels::CpuPermuteKernel::validate(src, dst, perm);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPermute.h b/src/cpu/operators/CpuPermute.h
new file mode 100644
index 0000000000..0e0f3ae8db
--- /dev/null
+++ b/src/cpu/operators/CpuPermute.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_PERMUTE_H
+#define ARM_COMPUTE_CPU_PERMUTE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuPermuteKernel */
+class CpuPermute : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @note Arbitrary permutation vectors are supported with rank not greater than 4
+ *
+ * @param[in] src Source tensor to permute. Data types supported: All
+     * @param[out] dst  Destination tensor. Data types supported: Same as @p src
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuPermute::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_PERMUTE_H */
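A minimal usage sketch for the interface above. Two assumptions are made here that should be checked against the PermutationVector documentation: that destination dimension perm[i] receives source dimension i (as in the library's shape-calculator helpers), and that configure() auto-initialises an empty destination info.

// Sketch only: rotate the first three dimensions of a 3D F32 tensor.
#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuPermute.h"

void permute_example()
{
    arm_compute::TensorInfo src(arm_compute::TensorShape(8U, 4U, 2U), 1, arm_compute::DataType::F32);
    arm_compute::TensorInfo dst{}; // left empty; expected to be auto-initialised by configure()
    const arm_compute::PermutationVector perm(1U, 2U, 0U);

    if(bool(arm_compute::cpu::CpuPermute::validate(&src, &dst, perm)))
    {
        arm_compute::cpu::CpuPermute op;
        op.configure(&src, &dst, perm);
        // run() then takes an ITensorPack with the ACL_SRC and ACL_DST tensors.
    }
}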
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
new file mode 100644
index 0000000000..6059c75dd2
--- /dev/null
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPool2d.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/cpu/kernels/CpuPool2dKernel.h"
+#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPool2d::CpuPool2d()
+ : _pooling_layer_kernel(),
+ _border_handler(),
+ _asm_glue(),
+ _is_global_pooling_layer(false),
+ _data_layout(DataLayout::NCHW),
+ _aux_mem(1)
+{
+}
+
+CpuPool2d::~CpuPool2d() = default;
+
+void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+{
+ // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
+ const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+
+ // Get data layout
+ _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+
+ // Check if we have Global Pooling Layer
+ const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height);
+
+ if(run_optimised)
+ {
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>();
+ ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
+ pooling_wrapper->configure(src, dst, pool_info, ci);
+
+ // Get kernel's memory requirements
+ constexpr size_t alignment = 4096;
+ const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
+ _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
+
+ _asm_glue = std::move(pooling_wrapper);
+ }
+ else
+ {
+ // Configure pooling kernel
+ auto k = std::make_unique<kernels::CpuPool2dKernel>();
+ k->configure(src, dst, pool_info, indices);
+ _pooling_layer_kernel = std::move(k);
+
+ switch(_data_layout)
+ {
+ case DataLayout::NCHW:
+ {
+ // Configure border depending on operation required (quantize border in case of asymmetric data_type)
+ BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+ PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
+ if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding)
+ {
+ zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+ }
+ auto b = std::make_unique<NEFillBorderKernel>();
+ b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value);
+ _border_handler = std::move(b);
+ break;
+ }
+ case DataLayout::NHWC:
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+ }
+}
+
+Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+{
+ const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+
+ if(run_optimised)
+ {
+ return Status{};
+ }
+
+ return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices);
+}
+
+void CpuPool2d::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
+
+ if(_asm_glue)
+ {
+ const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
+ NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
+ }
+ else
+ {
+ switch(_data_layout)
+ {
+ case DataLayout::NCHW:
+ // Fill border
+ NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors);
+
+ // Run pooling layer
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
+ break;
+ case DataLayout::NHWC:
+ // Run pooling layer
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data layout not supported");
+ }
+ }
+}
+
+experimental::MemoryRequirements CpuPool2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h
new file mode 100644
index 0000000000..471637164f
--- /dev/null
+++ b/src/cpu/operators/CpuPool2d.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL2D_H
+#define ARM_COMPUTE_CPU_POOL2D_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward Declarations
+struct PoolingLayerInfo;
+
+namespace cpu
+{
+/** Basic function to run a pooling layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::CpuPool2dKernel
+ * -# @ref kernels::CpuPool2dAssemblyWrapperKernel
+ */
+class CpuPool2d : public ICpuOperator
+{
+public:
+ CpuPool2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d);
+ ~CpuPool2d();
+ /** Set the src and dst tensors.
+ *
+ * @note F16 is supported for pool sizes 2 and 3 only
+ *
+ * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
+ */
+ void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPool2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<INEKernel> _pooling_layer_kernel;
+ std::unique_ptr<INEKernel> _border_handler;
+ std::unique_ptr<INEKernel> _asm_glue;
+
+ bool _is_global_pooling_layer;
+ DataLayout _data_layout;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_POOL2D_H */
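Because the assembly-backed path needs scratch memory, a caller is expected to query workspace() after configure() and provide the temporary buffers through the tensor pack at run() time. A condensed sketch under those assumptions; the PoolingLayerInfo constructor arguments and the MemoryInfo field names (slot, size, alignment) are taken on trust from the public headers rather than from this patch.

// Sketch only: 2x2 max pooling, stride 2, on an NHWC F32 tensor.
#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuPool2d.h"

void pool_example()
{
    using namespace arm_compute;

    // NHWC shapes are given as (C, W, H).
    TensorInfo src(TensorShape(16U, 32U, 32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(16U, 16U, 16U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

    cpu::CpuPool2d pool;
    pool.configure(&src, &dst, pool_info);

    // One entry per temporary buffer the caller must allocate and add to the run() pack.
    for(const auto &mem : pool.workspace())
    {
        static_cast<void>(mem); // mem.slot identifies the pack slot, mem.size/mem.alignment the allocation
    }
}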
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
new file mode 100644
index 0000000000..0bfcc21942
--- /dev/null
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/cpu/operators/CpuQuantize.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst));
+ return Status{};
+}
+
+void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Configure quantize kernel
+ auto k = std::make_unique<kernels::CpuQuantizeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+void CpuQuantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuQuantize.h b/src/cpu/operators/CpuQuantize.h
new file mode 100644
index 0000000000..ec1134fee4
--- /dev/null
+++ b/src/cpu/operators/CpuQuantize.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_QUANTIZE_H
+#define ARM_COMPUTE_CPU_QUANTIZE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */
+class CpuQuantize : public ICpuOperator
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] dst Destination tensor info with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuQuantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */
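As a reminder of what the kernel computes, affine quantization maps a real value onto the integer grid through the destination's scale and zero point. A scalar sketch for the QASYMM8 case; the kernel works on whole windows, also covers QASYMM8_SIGNED/QASYMM16, and its exact rounding policy may differ from the one used here.

#include <algorithm>
#include <cmath>
#include <cstdint>

// q = clamp(round(x / scale) + zero_point, 0, 255) for a QASYMM8 destination.
uint8_t quantize_qasymm8(float x, float scale, int32_t zero_point)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + zero_point;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}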
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
new file mode 100644
index 0000000000..5d2b052e34
--- /dev/null
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuReshape.h"
+
+#include "src/cpu/kernels/CpuReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuReshapeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuReshapeKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
new file mode 100644
index 0000000000..92dcb09aa9
--- /dev/null
+++ b/src/cpu/operators/CpuReshape.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_RESHAPE_H
+#define ARM_COMPUTE_CPU_RESHAPE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuReshapeKernel */
+class CpuReshape : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info. Data type supported: All
+ * @param[out] dst Destination info. Data type supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuReshape::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_RESHAPE_H */
diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp
new file mode 100644
index 0000000000..9e35bccec5
--- /dev/null
+++ b/src/cpu/operators/CpuScale.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuScale.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/cpu/kernels/CpuScaleKernel.h"
+#include "support/Rounding.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
+{
+ ARM_COMPUTE_ERROR_ON(offsets == nullptr);
+ float sampling_offset = 0.0f;
+ if(sampling_policy == SamplingPolicy::CENTER)
+ {
+ sampling_offset = 0.5f;
+ }
+
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
+ win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
+
+ if(dx != nullptr && dy != nullptr)
+ {
+ // Pre-compute the offset and pixel's distance for BILINEAR interpolation
+ Iterator offsets_it(offsets, win);
+ Iterator dx_it(dx, win);
+ Iterator dy_it(dy, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
+ const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
+ const int in_xi = std::floor(in_x);
+ const int in_yi = std::floor(in_y);
+
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
+ *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
+ },
+ offsets_it, dx_it, dy_it);
+ }
+ else
+ {
+ // Pre-compute the offset for NEAREST interpolation
+ Iterator offsets_it(offsets, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ const float float_in_xi = (id.x() + sampling_offset) * wr;
+ const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ },
+ offsets_it);
+ }
+}
+} // namespace
+
+void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info));
+
+ _scale_info = info;
+ _is_prepared = false;
+
+ // Get data layout and width/height indices
+ _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source width/height and destination width/height
+ const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
+ && hr <= 1.f) ?
+ InterpolationPolicy::NEAREST_NEIGHBOR :
+ _scale_info.interpolation_policy;
+
+ // Get the tensor shape
+ TensorShape shape(dst->dimension(idx_width));
+ shape.set(1, dst->dimension(idx_height), false);
+
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dxdy(shape, Format::F32);
+
+ auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy);
+ auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy);
+ auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets);
+ auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
+ switch(policy_to_use)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+ _kernel = std::move(scale_kernel);
+}
+
+Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+
+ ITensorInfo *offsets = nullptr;
+ ITensorInfo *dx = nullptr;
+ ITensorInfo *dy = nullptr;
+
+ // Get data layout and width/height indices
+ const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source width/height and destination width/height
+ const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+
+    // Get the tensor shape of auxiliary buffers
+ const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
+ TensorInfo tensor_info_offsets(shape, Format::S32);
+ TensorInfo tensor_info_dx(shape, Format::F32);
+ TensorInfo tensor_info_dy(shape, Format::F32);
+ switch(policy_to_use)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ offsets = &tensor_info_offsets;
+ break;
+ case InterpolationPolicy::BILINEAR:
+ offsets = &tensor_info_offsets;
+ dx = &tensor_info_dx;
+ dy = &tensor_info_dy;
+ break;
+ default:
+ break;
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
+ return Status{};
+}
+
+void CpuScale::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ _is_prepared = true;
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+ auto dx = tensors.get_tensor(TensorType::ACL_INT_0);
+ auto dy = tensors.get_tensor(TensorType::ACL_INT_1);
+ auto offsets = tensors.get_tensor(TensorType::ACL_INT_2);
+
+ // Get data layout and width/height indices
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+ // Compute the ratio between source width/height and destination width/height
+ const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
+ && hr <= 1.f) ?
+ InterpolationPolicy::NEAREST_NEIGHBOR :
+ _scale_info.interpolation_policy;
+ const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
+
+ switch(policy_to_use)
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ // Pre-compute offsets for nearest interpolation
+ precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used);
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ // Pre-compute dx, dy and offsets for bilinear interpolation
+ precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used);
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
+ }
+}
+
+void CpuScale::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ prepare(tensors);
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
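The offsets, dx and dy tables prepared above feed the bilinear path of the scale kernel: each output pixel blends its four input neighbours by the fractional distances. How the kernel consumes the tables is not shown in this file, but the blend itself is the standard bilinear formula, sketched here in scalar form:

// p00..p11 are the four neighbours around the sampling point; dx, dy are the fractional parts in [0, 1).
float bilinear_blend(float p00, float p01, float p10, float p11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy);
    const float w01 = dx * (1.f - dy);
    const float w10 = (1.f - dx) * dy;
    const float w11 = dx * dy;
    return p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11;
}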
diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h
new file mode 100644
index 0000000000..f605af6712
--- /dev/null
+++ b/src/cpu/operators/CpuScale.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SCALE_H
+#define ARM_COMPUTE_CPU_SCALE_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to compute Scale */
+class CpuScale : public ICpuOperator
+{
+public:
+ /** Initialize the function's source, destination, interpolation type and border_mode.
+ *
+ * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo to be used for configuration
+ */
+ void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuScale::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+ // Inherited methods overridden:
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+
+private:
+ ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED };
+ DataLayout _data_layout{ DataLayout::UNKNOWN };
+ bool _is_prepared{ false };
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SCALE_H */
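A minimal caller-side sketch of this interface, reusing the two-argument ScaleKernelInfo initialisation visible in the private member above; the shapes and policies are illustrative.

// Sketch only: validate and configure a bilinear 2x upscale of an F32 tensor.
#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuScale.h"

void scale_example()
{
    using namespace arm_compute;

    TensorInfo src(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 32U), 1, DataType::F32);
    const ScaleKernelInfo info{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE };

    if(bool(cpu::CpuScale::validate(&src, &dst, info)))
    {
        cpu::CpuScale scale;
        scale.configure(&src, &dst, info);
        // prepare()/run() then receive the backing tensors, plus the dx/dy/offsets buffers, via an ITensorPack.
    }
}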
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
new file mode 100644
index 0000000000..b70ee7e4df
--- /dev/null
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuSoftmax.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/SoftmaxHelpers.h"
+#include "src/cpu/kernels/CpuSoftmaxKernel.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+template <bool IS_LOG>
+CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric()
+ : _permute_input(),
+ _permute_output(),
+ _max_kernel(),
+ _softmax_kernel(),
+ _max(),
+ _tmp(),
+ _input_permuted(),
+ _output_permuted(),
+ _needs_permute(false),
+ _aux_mem(InternalTensorIdx::COUNT)
+{
+}
+
+template <bool IS_LOG>
+void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis)
+{
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
+
+ const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+
+ _needs_permute = actual_axis > 0;
+
+ if(_needs_permute)
+ {
+ _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ }
+
+ // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
+ // or it is the original input case (2D case)
+ const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src);
+
+ // Create intermediate tensors shapes
+ TensorShape max_sum_shape = tmp_input->tensor_shape();
+ max_sum_shape.set(0, 1);
+ const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+ TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+
+ // Init intermediate tensors
+ _max = TensorInfo(max_info);
+ _tmp = TensorInfo(tensor_info_tmp);
+
+ // Configure kernels
+ auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>();
+ mk->configure(tmp_input, &_max);
+ _max_kernel = std::move(mk);
+
+ auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
+ if(_needs_permute)
+ {
+ // The normalization kernel stores the result in a permuted output tensor
+ sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
+
+ // Re-permute the permuted output into the requested (4D) output
+ _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ }
+ else
+ {
+ // Softmax 2D case
+ sm->configure(tmp_input, &_max, dst, beta, &_tmp);
+ }
+ _softmax_kernel = std::move(sm);
+
+ _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
+ _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+
+ _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size());
+}
+
+template <bool IS_LOG>
+Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis)
+{
+ // Perform validation step
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_UNUSED(beta);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
+
+ // Create intermediate tensor info
+ DataType tmp_data_type = src->data_type();
+ const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true));
+
+ TensorShape max_sum_shape = src->tensor_shape();
+ max_sum_shape.set(0, 1);
+ const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
+ const TensorInfo dont_care;
+
+ const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+
+ const bool needs_permute = actual_axis > 0;
+
+ if(needs_permute)
+ {
+ const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
+ TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
+ TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+
+ return Status{};
+}
+
+template <bool IS_LOG>
+void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true);
+ CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
+
+ CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
+ CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true);
+
+ ITensorPack max_pack;
+ ITensorPack softmax_pack;
+
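+    // When the softmax axis is not 0, run the kernels on a permuted copy of the input and write to a
+    // permuted output that is re-permuted into the requested destination afterwards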
+ if(_needs_permute)
+ {
+ ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } };
+ _permute_input.run(permute_in_pack);
+
+ max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } };
+
+ softmax_pack =
+ {
+ { TensorType::ACL_SRC_0, input_permuted.get() },
+ { TensorType::ACL_SRC_1, max.get() },
+ { TensorType::ACL_DST_0, output_permuted.get() },
+ { TensorType::ACL_DST_1, tmp.get() }
+ };
+ }
+ else
+ {
+ max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } };
+
+ softmax_pack =
+ {
+ { TensorType::ACL_SRC_0, src },
+ { TensorType::ACL_SRC_1, max.get() },
+ { TensorType::ACL_DST_0, dst },
+ { TensorType::ACL_DST_1, tmp.get() }
+ };
+ }
+
+ NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
+ NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
+
+ if(_needs_permute)
+ {
+ ITensorPack permute_out_pack;
+ permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get());
+ permute_out_pack.add_tensor(TensorType::ACL_DST, dst);
+ _permute_output.run(permute_out_pack);
+ }
+}
+
+template <bool IS_LOG>
+experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
+{
+ return _aux_mem;
+}
+
+template class CpuSoftmaxGeneric<false>;
+template class CpuSoftmaxGeneric<true>;
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
new file mode 100644
index 0000000000..20f3f006d3
--- /dev/null
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
+#define ARM_COMPUTE_CPU_SOFTMAX_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuLogits1DMaxKernel;
+template <bool IS_LOG>
+class CpuLogits1DSoftmaxKernel;
+
+/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
+ *
+ * Softmax is calculated by :
+ * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
+ *
+ * Log Softmax is calculated by :
+ * @f[ out = (x - max(x)) * beta - log(\sum{e^{(x - max(x)) * beta}}) @f]
+ *
+ * This function runs the following function/kernels:
+ * -# If axis is not 0:
+ * -# @ref CpuPermute
+ * -# @ref kernels::CpuLogits1DMaxKernel
+ * -# @ref kernels::CpuLogits1DSoftmaxKernel
+ */
+template <bool IS_LOG = false>
+class CpuSoftmaxGeneric : public ICpuOperator
+{
+public:
+ CpuSoftmaxGeneric();
+ /** Set the input and output tensors.
+ *
+     * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out]    dst Destination tensor info. Data types supported: same as @p src.
+ * @param[in] beta (Optional) A scaling factor for the exponent.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuSoftmaxGeneric::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum InternalTensorIdx
+ {
+ MAX = 0,
+ TMP,
+ PERMUTED_SRC,
+ PERMUTED_DST,
+ COUNT
+ };
+
+ CpuPermute _permute_input;
+ CpuPermute _permute_output;
+ std::unique_ptr<ICpuKernel> _max_kernel;
+ std::unique_ptr<ICpuKernel> _softmax_kernel;
+
+ TensorInfo _max;
+ TensorInfo _tmp;
+ TensorInfo _input_permuted;
+ TensorInfo _output_permuted;
+
+ bool _needs_permute;
+ experimental::MemoryRequirements _aux_mem{};
+};
+using CpuSoftmax = CpuSoftmaxGeneric<false>;
+using CpuLogSoftmax = CpuSoftmaxGeneric<true>;
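+
+// Illustrative configure/validate sketch (not part of this patch; shapes, beta and axis are
+// arbitrary examples). At run time the caller passes an ITensorPack with ACL_SRC/ACL_DST tensors
+// and provides the auxiliary tensors reported by workspace():
+//
+//   TensorInfo src(TensorShape(128U, 32U), 1, DataType::F32);
+//   TensorInfo dst(TensorShape(128U, 32U), 1, DataType::F32);
+//   CpuSoftmax softmax;
+//   ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmax::validate(&src, &dst, /* beta */ 1.f, /* axis */ 0));
+//   softmax.configure(&src, &dst, 1.f, 0);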
+
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */
diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp
new file mode 100644
index 0000000000..0485a595c7
--- /dev/null
+++ b/src/cpu/operators/CpuSub.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuSub.h"
+
+#include "src/cpu/kernels/CpuSubKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
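+    // Fused activation is not supported by this operator; validate() rejects an enabled ActivationLayerInfo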
+ ARM_COMPUTE_UNUSED(act_info);
+ auto k = std::make_unique<kernels::CpuSubKernel>();
+ k->configure(src0, src1, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+ return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
+}
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h
new file mode 100644
index 0000000000..025c928d8f
--- /dev/null
+++ b/src/cpu/operators/CpuSub.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_SUB_H
+#define ARM_COMPUTE_CPU_SUB_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuSubKernel */
+class CpuSub : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * Valid configurations (src0,src1) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (QASYMM8, QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ *
+ * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[out] dst Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuSub::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
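+
+// Illustrative configure/validate sketch (not part of this patch; tensor shapes are arbitrary
+// examples). A fused activation is not supported, so act_info is left at its default:
+//
+//   TensorInfo src0(TensorShape(16U, 8U), 1, DataType::F32);
+//   TensorInfo src1(TensorShape(16U, 8U), 1, DataType::F32);
+//   TensorInfo dst(TensorShape(16U, 8U), 1, DataType::F32);
+//   CpuSub sub;
+//   ARM_COMPUTE_ERROR_THROW_ON(CpuSub::validate(&src0, &src1, &dst, ConvertPolicy::SATURATE));
+//   sub.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);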
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_SUB_H */ \ No newline at end of file
diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp
new file mode 100644
index 0000000000..518227b464
--- /dev/null
+++ b/src/cpu/operators/CpuTranspose.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuTranspose.h"
+
+#include "src/cpu/kernels/CpuTransposeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::CpuTransposeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuTransposeKernel::validate(src, dst);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuTranspose.h b/src/cpu/operators/CpuTranspose.h
new file mode 100644
index 0000000000..8934481ef6
--- /dev/null
+++ b/src/cpu/operators/CpuTranspose.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H
+#define ARM_COMPUTE_CPU_TRANSPOSE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuTransposeKernel */
+class CpuTranspose : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor to permute. Data types supported: All
+     * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuTranspose::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
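+
+// Illustrative sketch (not part of this patch; shapes are arbitrary examples). The destination
+// shape is the source shape with the first two dimensions swapped:
+//
+//   TensorInfo src(TensorShape(8U, 4U), 1, DataType::F32);
+//   TensorInfo dst(TensorShape(4U, 8U), 1, DataType::F32);
+//   CpuTranspose transpose;
+//   ARM_COMPUTE_ERROR_THROW_ON(CpuTranspose::validate(&src, &dst));
+//   transpose.configure(&src, &dst);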
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
new file mode 100644
index 0000000000..8fca836b8e
--- /dev/null
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -0,0 +1,839 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
+#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+#include "support/Cast.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::utils::cast;
+
+namespace
+{
+arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
+{
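+    // Map the ACL activation to the subset understood by arm_gemm; unsupported activations map to
+    // None and are handled by a separate CpuActivation pass (see _run_activation in configure())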
+ switch(act_info.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ {
+ return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
+ }
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ {
+ return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
+ }
+ default:
+ {
+ return arm_gemm::Activation(arm_gemm::Activation::Type::None);
+ }
+ }
+}
+
+inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ if(src->data_type() == DataType::F32)
+ {
+ if(input_dims.width > 4 && input_dims.height > 4)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 4, 4, 3, 3>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 2, 3, 3>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
+ }
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if(src->data_type() == DataType::F16)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_5x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 2, 5, 5>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, dst, winograd_info)));
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_3x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 6, 1, 3>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, dst, winograd_info)));
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_1x3(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 6, 1, 3, 1>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
+
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_5x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 4, 1, 5>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, dst, winograd_info)));
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+inline Status validate_kernel_1x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 4, 1, 5, 1>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_7x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 1, 2, 1, 7>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, dst, winograd_info)));
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Status validate_kernel_1x7(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+ const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<float, 2, 1, 7, 1>::validate(src, input0, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, dst, winograd_info)));
+
+ if(act_info.enabled())
+ {
+ CpuActivation::validate(dst, nullptr, act_info);
+ }
+ return Status{};
+}
+
+inline Tensor4DShape internal_get_input_shape(const ITensorInfo *src)
+{
+ const DataLayout data_layout = src->data_layout();
+ const int in_width = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+ const int in_height = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+ const int in_channels = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const int in_batches = src->dimension(3);
+
+ return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_UNUSED(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+ return ICpuWinogradConv2dTransformWeightsKernel::validate(src, weights);
+}
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type)
+{
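+    // Pick the output tile associated with each supported kernel size. For 3x3 kernels, small
+    // spatial inputs use the 2x2 tile, except for F16 which only has a 4x4 implementation.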
+ Size2D output_tile = Size2D{};
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ if(data_type == DataType::F16)
+ {
+ output_tile = Size2D(4U, 4U);
+ }
+ }
+ else if(kernel_dims == Size2D(5U, 5U))
+ {
+ output_tile = Size2D(2U, 2U);
+ }
+ else if(kernel_dims == Size2D(1U, 3U))
+ {
+ output_tile = Size2D(1U, 6U);
+ }
+ else if(kernel_dims == Size2D(3U, 1U))
+ {
+ output_tile = Size2D(6U, 1U);
+ }
+ else if(kernel_dims == Size2D(1U, 5U))
+ {
+ output_tile = Size2D(1U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 1U))
+ {
+ output_tile = Size2D(4U, 1U);
+ }
+ else if(kernel_dims == Size2D(7U, 1U))
+ {
+ output_tile = Size2D(2U, 1U);
+ }
+ else if(kernel_dims == Size2D(1U, 7U))
+ {
+ output_tile = Size2D(1U, 2U);
+ }
+ return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type)
+{
+ // Check if we want to configure a Winograd configuration which requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ const std::vector<WinogradConfiguration> fast_math_winograd_f16 =
+ {
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3))
+ };
+
+ const std::vector<WinogradConfiguration> fast_math_winograd_f32 =
+ {
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ switch(data_type)
+ {
+ case DataType::F16:
+ return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end();
+ case DataType::F32:
+ return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end();
+ default:
+ return false;
+ }
+}
+
+inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
+{
+ return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+}
+
+} // namespace
+
+CpuWinogradConv2d::CpuWinogradConv2d()
+ : _gemm_function(std::make_unique<CpuGemm>()),
+ _activation_func(std::make_unique<CpuActivation>()),
+ _permute_input(std::make_unique<CpuPermute>()),
+ _permute_output(std::make_unique<CpuPermute>()),
+ _permute_weights(std::make_unique<CpuPermute>()),
+ _transform_input_kernel(nullptr),
+ _transform_weights_kernel(nullptr),
+ _transform_output_kernel(nullptr),
+ _data_layout(),
+ _aux_mem(AuxTensorIdx::Count),
+ _input_nhwc(),
+ _output_nhwc(),
+ _input_workspace(),
+ _kernel_storage(),
+ _output_workspace(),
+ _input_transformed(),
+ _output_transformed(),
+ _weights_hwio(),
+ _run_activation(false),
+ _is_prepared(false)
+{
+}
+
+CpuWinogradConv2d::~CpuWinogradConv2d() = default;
+
+void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info));
+
+ // Get indices for the width and height
+ _data_layout = src->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+
+ const Size2D input_dims = Size2D(src->dimension(width_idx), src->dimension(height_idx));
+ const Size2D kernel_size = Size2D(weights->dimension(width_idx), weights->dimension(height_idx));
+ const DataType data_type = src->data_type();
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
+ "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ _is_prepared = false;
+
+ std::unique_ptr<ICpuWinogradConv2dTransformInputKernel> transform_input_kernel;
+ std::unique_ptr<ICpuWinogradConv2dTransformWeightsKernel> transform_weights_kernel;
+ std::unique_ptr<ICpuWinogradConv2dTransformOutputKernel> transform_output_kernel;
+
+ int n_gemms = 1;
+ int N_BLOCK = 1; // Size of block used by GEMM.
+ if(data_type == DataType::F32)
+ {
+ if(kernel_size == Size2D(3, 3))
+ {
+ if(src->dimension(width_idx) > 4 && src->dimension(height_idx) > 4)
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 4, 4, 3, 3>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 2, 2, 3, 3>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ }
+ else if(kernel_size == Size2D(5, 5))
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 2, 2, 5, 5>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(1, 3))
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 6, 1, 3, 1>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(3, 1))
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 1, 6, 1, 3>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(1, 5))
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 4, 1, 5, 1>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(5, 1))
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 1, 4, 1, 5>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(1, 7))
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 2, 1, 7, 1>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else if(kernel_size == Size2D(7, 1))
+ {
+ using config = CpuWinogradConv2dConfiguration<float, float, 1, 2, 1, 7>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ }
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else if(data_type == DataType::F16)
+ {
+ if(kernel_size == Size2D(3, 3))
+ {
+ using config = CpuWinogradConv2dConfiguration<__fp16, __fp16, 4, 4, 3, 3>;
+ transform_input_kernel = std::make_unique<config::TransformInputKernel>();
+ transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
+ transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
+ n_gemms = config::WinogradBase::N_GEMMS;
+ N_BLOCK = config::WinogradConv::N_BLOCK;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ }
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ }
+
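+    // Winograd only distinguishes SAME from VALID padding: any non-zero top/left padding is treated as SAME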
+ const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
+ const bool use_same_padding = use_padding_type == PADDING_SAME;
+
+ // Get convolved dimensions
+ const int in_channels = src->dimension(channel_idx);
+ const int out_channels = dst->dimension(channel_idx);
+
+ const Tensor4DShape in_shape(internal_get_input_shape(src));
+ const size_t data_type_size = src->element_size();
+ // Get the memory required to instantiate a new Winograd operator.
+ constexpr size_t storage_alignment = 64;
+
+ // Kernel Storage
+ const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
+
+ // Input storage
+ const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
+
+ // Output storage
+ const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
+ const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
+ const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
+ const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
+ const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
+
+ // Configure GEMM
+ const int tile_rows = iceildiv(output_shape.first, output_tile.height);
+ const int tile_cols = iceildiv(output_shape.second, output_tile.width);
+ const int m = in_shape.n_batches * tile_rows * tile_cols;
+ const int k = in_shape.n_channels;
+ const int n = out_channels;
+ const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
+ const int output_matrix_row_stride = kernel_matrix_row_stride;
+
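+    // Describe the transformed input (A), transformed weights (B) and transformed output (D) as
+    // batched matrices: n_gemms GEMMs of size (m x k) * (k x n), one per Winograd sub-matrix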
+ TensorShape a_shape(k, m, 1, n_gemms);
+ Strides a_strides(data_type_size);
+ a_strides.set(1, a_strides[0] * k);
+ //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
+ a_strides.set(2, 0);
+ a_strides.set(3, data_type_size * input_matrix_stride);
+
+ TensorShape b_shape(n, k, n_gemms);
+ Strides b_strides(data_type_size);
+ b_strides.set(1, data_type_size * kernel_matrix_row_stride);
+ b_strides.set(2, data_type_size * kernel_matrix_stride);
+
+ TensorShape d_shape(n, m, 1, n_gemms);
+ Strides d_strides(data_type_size);
+ d_strides.set(1, data_type_size * output_matrix_row_stride);
+ //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
+ d_strides.set(2, 0);
+ d_strides.set(3, data_type_size * output_matrix_stride);
+
+ TensorInfo a_info{};
+ TensorInfo b_info{};
+ TensorInfo d_info{};
+ a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
+ b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
+ d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
+
+ _input_transformed = a_info;
+ _kernel_storage = b_info;
+ _output_transformed = d_info;
+
+ const ITensorInfo *input_to_use = src;
+ ITensorInfo *output_to_use = dst;
+ PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
+ const unsigned int max_num_threads = NEScheduler::get().num_threads();
+
+ // Configure the kernel to transform the input tensor from NCHW -> NHWC
+ if(_data_layout == DataLayout::NCHW)
+ {
+ _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ input_to_use = &_input_nhwc;
+ weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
+ }
+
+ // Configure input transform kernel
+ transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+ &_input_transformed, input_matrix_stride, &_input_workspace);
+ const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
+ TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
+ _input_workspace = input_workspace_info;
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
+ transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+
+ // Configure GEMM function
+ _gemm_function->configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
+
+ // Configure output transform function
+    // The biases tensor has not been allocated at this point in time; the output transform adds the biases to the final result in the run() method
+ if(_data_layout == DataLayout::NCHW)
+ {
+        // Describe the intermediate NHWC dst tensor used to convert from the Winograd domain to the spatial domain before permuting back to NCHW
+ TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
+ dst->dimension(1), dst->dimension(3)),
+ 1, dst->data_type());
+ _output_nhwc = info;
+ output_to_use = &_output_nhwc;
+ }
+ const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
+
+ transform_output_kernel->configure(biases,
+ &_output_transformed,
+ output_matrix_stride,
+ output_to_use,
+ in_shape.n_batches,
+ output_shape.first,
+ output_shape.second,
+ out_channels,
+ &_output_workspace,
+ activation);
+
+ const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
+ TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
+ _output_workspace = output_workspace_info;
+
+    // Reorder the convolved output to ACL's NCHW ordering
+ if(_data_layout == DataLayout::NCHW)
+ {
+ _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
+ }
+
+ _transform_input_kernel = std::move(transform_input_kernel);
+ _transform_weights_kernel = std::move(transform_weights_kernel);
+ _transform_output_kernel = std::move(transform_output_kernel);
+
+    // Configure Activation Layer
+ _run_activation = act_info.enabled() && !fuse_function_supported(act_info);
+ if(_run_activation)
+ {
+ _activation_func->configure(dst, nullptr, act_info);
+ }
+
+ auto asm_mem_req = _gemm_function->workspace();
+ _aux_mem[GemmWorkspace] = asm_mem_req[GemmWorkspace];
+ _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
+ _aux_mem[InterleavedLHS] = asm_mem_req[InterleavedLHS];
+ _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS];
+ _aux_mem[TempResult] = asm_mem_req[TempResult];
+
+ // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
+ _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, input_storage_size, storage_alignment);
+ _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, output_storage_size, storage_alignment);
+ _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size));
+ _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
+ _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment);
+ if(_data_layout == DataLayout::NCHW)
+ {
+ _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
+ _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
+ }
+}
+
+Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(src->dimension(idx_width), src->dimension(idx_height));
+ const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
+ const DataType data_type = src->data_type();
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
+ "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info = WinogradInfo(output_tile,
+ kernel_size,
+ input_dims,
+ conv_info,
+ src->data_layout());
+
+ // Validate input transform
+ const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+ const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
+ // Validate filter transform
+ const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+ // Validate batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+
+ if(kernel_size == Size2D(3, 3))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ return validate_kernel_3x3(input_dims, src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(5, 5))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+ return validate_kernel_5x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ if(kernel_size == Size2D(3, 1))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_3x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(1, 3))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_1x3(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(5, 1))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_5x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(1, 5))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_1x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(7, 1))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_7x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ else if(kernel_size == Size2D(1, 7))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+ return validate_kernel_1x7(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
+ }
+}
+
+void CpuWinogradConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
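+    // a: source activations, c: optional biases consumed by the output transform, d: destination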
+ auto a = tensors.get_const_tensor(ACL_SRC_0);
+ auto c = tensors.get_const_tensor(ACL_SRC_2);
+ auto d = tensors.get_tensor(ACL_DST);
+
+ CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
+ CpuAuxTensorHandler input_transformed(offset_int_vec(TransformedInput), _input_transformed, tensors, true);
+ CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
+
+ const bool is_nchw = _data_layout == DataLayout::NCHW;
+ if(is_nchw)
+ {
+        // Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
+ ITensorPack pack{ { ACL_SRC, a }, { ACL_DST, input_nhwc.get() } };
+ _permute_input->run(pack);
+ }
+
+ // Transform input tensor to the winograd domain
+ ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : a }, { ACL_DST, input_transformed.get() }, { ACL_INT, input_workspace.get() } };
+ NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, _transform_input_kernel->window(), transform_input_pack);
+
+ CpuAuxTensorHandler output_transformed(offset_int_vec(TransformedOutput), _output_transformed, tensors, true);
+ CpuAuxTensorHandler weights_transformed(offset_int_vec(TransformedWeights), _kernel_storage, tensors, true);
+
+ // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC, input_transformed.get());
+ gemm_pack.add_const_tensor(ACL_SRC_1, weights_transformed.get());
+ gemm_pack.add_const_tensor(ACL_BIAS, nullptr);
+ gemm_pack.add_tensor(ACL_DST, output_transformed.get());
+ _gemm_function->run(gemm_pack);
+
+ // Transform output tensor to the spatial domain
+ CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
+ CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
+ ITensorPack transform_output_pack{ { ACL_SRC_0, c }, { ACL_SRC_1, output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : d }, { ACL_INT, output_workspace.get() } };
+ NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, _transform_output_kernel->window(), transform_output_pack);
+
+ if(is_nchw)
+ {
+        // Reorder the convolved output to ACL's NCHW ordering
+ ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, d } };
+ _permute_output->run(pack);
+ }
+
+ if(_run_activation)
+ {
+ ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ _activation_func->run(pack);
+ }
+}
+
+void CpuWinogradConv2d::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
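+        // One-off weight preparation: permute to HWIO, transform to the Winograd domain,
+        // then let the GEMM function pre-pack its operands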
+ // Permute weights
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
+
+ CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
+ ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ _permute_weights->run(permute_tensors);
+
+ // Transform weights
+ ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
+
+ CpuAuxTensorHandler transformed_weights(_kernel_storage, *weights_transf);
+ ITensorPack transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } };
+ NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors);
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get());
+ _gemm_function->prepare(gemm_pack);
+
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements CpuWinogradConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
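Editor's note: the following is a minimal usage sketch, not part of the patch, showing how a caller could drive the run()/prepare() pair above. It assumes the caller owns the I/O tensors; alloc_workspace() is a hypothetical callback that returns an ITensor sized and aligned according to a MemoryInfo entry, and includes beyond the two shown are assumed to come in transitively.

#include <functional>

#include "arm_compute/core/ITensorPack.h"
#include "src/cpu/operators/CpuWinogradConv2d.h"

// Sketch only (not part of the patch): bind the auxiliary buffers advertised by
// workspace() into the pack, then prepare and run the operator.
void run_winograd(arm_compute::cpu::CpuWinogradConv2d &op,
                  const arm_compute::ITensor *src, const arm_compute::ITensor *weights,
                  const arm_compute::ITensor *biases, arm_compute::ITensor *dst,
                  const std::function<arm_compute::ITensor *(const arm_compute::experimental::MemoryInfo &)> &alloc_workspace)
{
    using namespace arm_compute;
    ITensorPack pack{ { ACL_SRC_0, src }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, dst } };
    // Each requested buffer is keyed by the offset_int_vec(AuxTensorIdx) slot id that
    // run() and prepare() above use to look it up. alloc_workspace() is hypothetical.
    for(const auto &req : op.workspace())
    {
        pack.add_tensor(req.slot, alloc_workspace(req));
    }
    op.prepare(pack); // permutes and transforms the weights once (run() would also trigger this)
    op.run(pack);     // input transform -> batched GEMM -> output transform (+ optional fused activation)
}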
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
new file mode 100644
index 0000000000..0abd110f73
--- /dev/null
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuWinogradConv2d : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuWinogradConv2d();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWinogradConv2d);
+ /** Destructor */
+ ~CpuWinogradConv2d();
+
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
+ * Currently only small kernels (e.g. 3x3, 5x5, 1x7) are supported; see validate() for the exact set of shapes.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can come at the cost of reduced accuracy. Default is false.
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d
+ *
+ * Similar to CpuWinogradConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ GemmWorkspace = 0,
+ Pretranspose = 1,
+ InterleavedLHS = 2,
+ TransposedRHS = 3,
+ TempResult = 4,
+ TransformedInput = 5,
+ TransformedOutput = 6,
+ WorkspaceIO = 7,
+ TransformedWeights = 8,
+ PermutedWeights = 9,
+ PermutedInput = TransformedOutput,
+ PermutedOutput = TransformedInput,
+ Count = 10
+ };
+
+ std::unique_ptr<CpuGemm> _gemm_function;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<CpuPermute> _permute_input;
+ std::unique_ptr<CpuPermute> _permute_output;
+ std::unique_ptr<CpuPermute> _permute_weights;
+ std::unique_ptr<ICPPKernel> _transform_input_kernel;
+ std::unique_ptr<ICPPKernel> _transform_weights_kernel;
+ std::unique_ptr<ICPPKernel> _transform_output_kernel;
+
+ DataLayout _data_layout;
+ experimental::MemoryRequirements _aux_mem{ Count };
+ TensorInfo _input_nhwc;
+ TensorInfo _output_nhwc;
+ TensorInfo _input_workspace;
+ TensorInfo _kernel_storage;
+ TensorInfo _output_workspace;
+ TensorInfo _input_transformed;
+ TensorInfo _output_transformed;
+ TensorInfo _weights_hwio;
+ bool _run_activation;
+ bool _is_prepared;
+};
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H */
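Editor's note: a configuration-time sketch, not part of the patch, showing validate() and configure() on tensor metadata only, following the documented NCHW shape conventions ([width, height, IFM] input, [kernel_x, kernel_y, IFM, OFM] weights). The shapes are illustrative only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuWinogradConv2d.h"

// Sketch only (not part of the patch): F32 3x3 convolution, batch 1, SAME padding.
void configure_winograd_example()
{
    using namespace arm_compute;
    const TensorInfo    src(TensorShape(56U, 56U, 64U, 1U), 1, DataType::F32);     // 56x56, 64 input channels, batch 1
    const TensorInfo    weights(TensorShape(3U, 3U, 64U, 128U), 1, DataType::F32); // 3x3 kernel, 128 output channels
    const TensorInfo    biases(TensorShape(128U), 1, DataType::F32);
    TensorInfo          dst(TensorShape(56U, 56U, 128U, 1U), 1, DataType::F32);
    const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides, pad 1 == SAME for a 3x3 kernel

    cpu::CpuWinogradConv2d op;
    if(bool(cpu::CpuWinogradConv2d::validate(&src, &weights, &biases, &dst, conv_info)))
    {
        op.configure(&src, &weights, &biases, &dst, conv_info); // fast math left at its default (false)
    }
}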
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
new file mode 100644
index 0000000000..97893b0672
--- /dev/null
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/utils/AssemblyUtils.h"
+#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
+#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace cpu
+{
+using namespace arm_compute::experimental;
+
+namespace
+{
+struct free_delete
+{
+ void operator()(void *x)
+ {
+ free(x);
+ }
+};
+
+struct Params
+{
+ unsigned int M;
+ unsigned int N;
+ unsigned int K;
+ unsigned int batches;
+ unsigned int multis;
+ unsigned int sections;
+ bool indirect;
+};
+
+Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ Params p;
+ p.M = d->tensor_shape().y();
+ p.K = a->tensor_shape().x();
+ p.N = d->tensor_shape().x();
+ p.batches = 1;
+ p.multis = 1;
+ p.sections = 1;
+ p.indirect = false;
+
+ if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+ {
+ p.indirect = true;
+ p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
+ }
+ else
+ {
+ p.multis = b->tensor_shape().z();
+ p.batches = d->tensor_shape().total_size_upper(2) / p.multis;
+ }
+
+ // Update M in case of GEMM3D for output
+ if(info.depth_output_gemm3d != 0)
+ {
+ p.M = d->tensor_shape().y() * d->tensor_shape().z();
+ p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
+ }
+
+ return p;
+}
+
+IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
+{
+ // Schedule assembly kernel
+ const int granule_threshold = 200;
+ IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
+ if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
+ {
+ scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
+ }
+ else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
+ {
+ //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
+ scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ }
+ else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
+ {
+ //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
+ scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ }
+
+ return scheduling_hint;
+}
+
+/** Fallback in case ACL doesn't have a function */
+template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
+class Fallback : public CpuGemmAssemblyDispatch::IFallback
+{
+public:
+ /** Destructor */
+ ~Fallback() = default;
+
+ /** Initialise the function's input and output.
+ *
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[in] c Input tensor containing the Matrix C.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] args Matrix multiplication information.
+ * @param[in] gemm_info GEMM meta-data
+ * @param[in] os Output stage meta-data.
+ */
+ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
+ arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+ const OutputStage &os = {});
+
+ /** Set requantization data to be used
+ *
+ * @param[in] shifts Requantization shifts
+ * @param[in] multipliers Requantization multipliers
+ *
+ * @return A tuple with a flag indicating whether left shifts are required, followed by pointers to the left shift, right shift and multiplier data respectively
+ */
+ std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
+ const std::vector<int32_t> &multipliers);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ bool is_configured() const override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ AsmGemmWorkspace = 0,
+ Pretranspose,
+ Count
+ };
+
+ /** Configure the indirect buffer
+ *
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] info GEMM meta-data
+ */
+ void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
+ /** Prepare the indirect buffer */
+ void prepare_indirect_buffer(ITensorPack &tensors);
+
+ /** Assembly Gemm kernel */
+ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+ /** Optimised Arm® Neon™ kernel */
+ std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
+ /** Assembly GEMM workspace tensor info */
+ TensorInfo _workspace_info{};
+ /** Pre-transpose tensor info */
+ TensorInfo _pretranspose_info{};
+ /** Prepared flag */
+ bool _is_prepared{ false };
+ /** GEMM meta-data */
+ AsmGemmInfo _gemm_info{};
+ /** GEMM kernel description */
+ arm_gemm::KernelDescription _kernel_info{};
+ /** Per channel quantization shifts */
+ std::vector<int32_t> _shifts{};
+ /** Per channel quantization right shifts (derived in set_requantize_data) */
+ std::vector<int32_t> right_shifts{};
+ /** Per channel quantization left shifts (derived in set_requantize_data) */
+ std::vector<int32_t> left_shifts{};
+ /** Per channel quantization multipliers */
+ std::vector<int32_t> _multipliers{};
+ /** Indirect buffer */
+ std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
+ std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
+ std::vector<TypeInput> _indirect_pad{};
+ arm_gemm::ConvolutionParameters _cp{};
+ experimental::MemoryRequirements _aux_mem{ Count };
+};
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
+{
+ _multipliers = multipliers;
+ _shifts = shifts;
+ bool need_left = false;
+ for(const auto s : _shifts)
+ {
+ left_shifts.push_back(std::max(-s, int32_t(0)));
+ right_shifts.push_back(std::min(-s, int32_t(0)));
+ if(s < 0 && !need_left)
+ {
+ need_left = true;
+ }
+ }
+ return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
+{
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());
+ const int multis = 1;
+ const int batches = a->info()->tensor_shape().total_size_upper(3);
+ const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
+ const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
+
+ const size_t output_hw = _cp.output_height * _cp.output_width;
+ const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
+ const size_t batch_stride = batch_size / sizeof(TypeInput);
+ const int multi_size = batch_size * batches;
+ const size_t multi_stride = multi_size / sizeof(TypeInput);
+
+ for(int64_t m = 0; m < multis; m++)
+ {
+ for(int64_t b = 0; b < batches; b++)
+ {
+ for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+ {
+ for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+ {
+ int64_t output_xy = (output_y * _cp.output_width) + output_x;
+
+ for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+ {
+ for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+ {
+ int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
+ int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
+ int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
+ int64_t input_xy = (input_y * _cp.input_width) + input_x;
+
+ if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
+ {
+ _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
+ }
+ else
+ {
+ _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
+
+ float zeropad = 0.f;
+ if(is_data_type_quantized(a->data_type()))
+ {
+ zeropad = a->quantization_info().uniform().offset;
+ }
+
+ const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]);
+ const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]);
+ const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
+ const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
+ const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
+ const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
+ const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
+
+ _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
+ info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
+ };
+
+ if(info.method == AsmConvMethod::Conv)
+ {
+ _gemm_kernel_asm->set_convolution_parameters(_cp);
+ }
+
+ if(info.method == AsmConvMethod::Indirect)
+ {
+ const unsigned int multis = 1;
+ const unsigned int batches = a->tensor_shape().total_size_upper(3);
+ const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
+ const unsigned int output_hw = _cp.output_width * _cp.output_height;
+
+ using TypeInputPtr = TypeInput *;
+ const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr);
+ const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
+ const int multi_size = batch_size * batches;
+ const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
+
+ _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+ _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+ _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
+
+ // Set indirect argument
+ int64_t pos = 0;
+ for(int64_t m = 0; m < multis; m++)
+ {
+ for(int64_t b = 0; b < batches; b++)
+ {
+ for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+ {
+ (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+ }
+ }
+ }
+
+ _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
+ arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+ const OutputStage &os)
+{
+ ARM_COMPUTE_UNUSED(c);
+ arm_gemm::GemmConfig gemm_cfg;
+ _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
+ if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
+ {
+ gemm_cfg.filter = _kernel_info.name;
+ args._cfg = &gemm_cfg;
+ }
+ _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
+ if(_gemm_kernel_asm == nullptr)
+ {
+ // Configuration not supported: leave the function unconfigured
+ return;
+ }
+
+ // arm_compute wrapper for the Gemm object (see above)
+ auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
+ ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
+ acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
+ const size_t workspace_size = _gemm_kernel_asm->get_working_size();
+ const unsigned int alignment = 4096;
+ _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
+ _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
+
+ // Limit the number of threads to the kernel's window size. Without this block, ConvLayer deadlocks when threads > 1 and
+ // the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
+ {
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
+ if(window_size < static_cast<unsigned int>(args._maxthreads))
+ {
+ _gemm_kernel_asm->set_nthreads(window_size);
+ }
+ }
+
+ _optimised_kernel = std::move(acl_gemm_wrapper);
+ _gemm_info = gemm_info;
+ // Check for pre-transposed support
+ if(_gemm_kernel_asm->B_pretranspose_required())
+ {
+ // Forcing 128-byte alignment (required by 32-bit kernels)
+ const unsigned int alignment = 128;
+ const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
+ _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
+ _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+ }
+
+ // Handle indirect GEMM convolution
+ if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+ {
+ configure_indirect(a, b, d, gemm_info);
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+
+ // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
+ if(c && c->info()->data_type() == DataType::S32)
+ {
+ _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ }
+
+ // Pretranspose B if required
+ if(_gemm_kernel_asm->B_pretranspose_required())
+ {
+ const int ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+ CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
+ ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
+ _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
+
+ b->mark_as_unused();
+ }
+
+ if(_gemm_info.method == AsmConvMethod::Indirect)
+ {
+ prepare_indirect_buffer(tensors);
+ }
+
+ _is_prepared = true;
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
+{
+ return _optimised_kernel != nullptr;
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const
+{
+ return _aux_mem;
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
+{
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto d = tensors.get_tensor(TensorType::ACL_DST);
+
+ int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ int ldb = 0;
+ const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
+
+ const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
+ const size_t a_multi_idx = a_batch_idx + 1;
+ const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
+ const size_t d_multi_idx = d_batch_idx + 1;
+
+ int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
+ const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
+
+ int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
+ int multi_stride_b = 0;
+ const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
+
+ auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
+ const TypeInput *in1_ptr = nullptr;
+ auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
+
+ // Check if B is pre-transposed and de-reference it if not
+ if(!_gemm_kernel_asm->B_is_pretransposed())
+ {
+ ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+ in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ }
+
+ const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
+
+ // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
+ CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
+ if(workspace.get()->buffer() != nullptr)
+ {
+ _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
+ const unsigned int split_dim = scheduling_hint.split_dimension();
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ }
+ if(split_dim != IScheduler::split_dimensions_all)
+ {
+ // Make sure the kernel does not expect more threads than we can actually spawn
+ const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
+ num_threads = std::min(num_iterations, num_threads);
+ }
+ _gemm_kernel_asm->set_nthreads(num_threads);
+ }
+
+ // Prepare assembly kernel
+ prepare(tensors);
+
+ // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
+ TypeOutput *bias = nullptr;
+ if(c && c->info()->data_type() != DataType::S32)
+ {
+ bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
+ }
+
+ if(_gemm_info.method == AsmConvMethod::Indirect)
+ {
+ in0_ptr = nullptr;
+ lda = 0;
+ batch_stride_a = 0;
+ multi_stride_a = 0;
+ }
+
+ // Set gemm parameters
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
+ in1_ptr, ldb, multi_stride_b,
+ out_ptr, ldd, batch_stride_d, multi_stride_d,
+ bias, 0);
+ // Schedule
+ NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
+ arm_gemm::Activation activation, const AsmGemmInfo &info)
+{
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
+
+ // Create arm_gemm fallback
+ auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
+ fallback->configure(a, b, c, d, args, info);
+ arm_gemm = std::move(fallback);
+}
+
+template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
+ arm_gemm::Activation activation, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_UNUSED(activation);
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
+
+ // Create arm_gemm fallback
+ auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
+
+ // Configure requantization info
+ const int32_t negation = info.negated_offsets ? 1 : -1;
+ const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
+ const int32_t b_offset = -b->quantization_info().uniform().offset * negation;
+ const GEMMLowpOutputStageInfo os_info = info.output_stage;
+
+ arm_gemm::Requantize32 gemm_requant_info{};
+ if(os_info.gemmlowp_shifts.size() > 1)
+ {
+ const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
+ gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
+ a_offset, b_offset, os_info.gemmlowp_offset,
+ (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
+ std::get<2>(requantize_data),
+ std::get<3>(requantize_data),
+ os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ }
+ else
+ {
+ gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
+ a_offset, b_offset, os_info.gemmlowp_offset,
+ -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
+ os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ }
+
+ // Configure fallback
+ fallback->configure(a, b, c, d, args, info, gemm_requant_info);
+ arm_gemm = std::move(fallback);
+}
+} //namespace
+
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
+ : _arm_gemm(nullptr)
+{
+}
+
+Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_UNUSED(c, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
+
+#ifndef __aarch64__
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8-bit integer types only supported for aarch64");
+#endif /* __aarch64__ */
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ if(is_data_type_quantized_per_channel(b->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
+ return Status{};
+}
+
+bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
+{
+ arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
+ return act.type != arm_gemm::Activation::Type::None;
+}
+
+void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
+
+ //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
+ if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
+ {
+ return;
+ }
+
+ switch(a->data_type())
+ {
+ case DataType::F32:
+ create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
+ break;
+#ifdef __aarch64__
+ case DataType::U8:
+ case DataType::QASYMM8:
+ if(d->data_type() == DataType::S32)
+ {
+ create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ else
+ {
+ create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ break;
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ if(d->data_type() == DataType::S32)
+ {
+ create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ else
+ {
+ create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
+ }
+ break;
+#endif /* __aarch64__ */
+#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
+ case DataType::BFLOAT16:
+ create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
+ break;
+#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
+ break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ break;
+ }
+}
+
+void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ _arm_gemm->prepare(tensors);
+}
+
+bool CpuGemmAssemblyDispatch::is_configured() const
+{
+ return _arm_gemm != nullptr && _arm_gemm->is_configured();
+}
+
+void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ _arm_gemm->run(tensors);
+}
+
+experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
+{
+ ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
+ return _arm_gemm->workspace();
+}
+} // namespace cpu
+} // namespace arm_compute
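Editor's note: a small standalone illustration, not part of the patch, of the shift decomposition performed by Fallback::set_requantize_data() above: each ACL per-channel shift s is negated and split into a non-negative left-shift component and a non-positive right-shift component, and the need_left flag is raised as soon as any s is negative. The input values are examples only.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int32_t> acl_shifts{ 3, 0, -2 }; // example values only
    std::vector<int32_t> left, right;
    bool need_left = false;
    for(const auto s : acl_shifts)
    {
        left.push_back(std::max(-s, int32_t(0)));  // s = -2 -> left component of 2
        right.push_back(std::min(-s, int32_t(0))); // s =  3 -> right component of -3
        need_left = need_left || (s < 0);
    }
    for(size_t i = 0; i < acl_shifts.size(); ++i)
    {
        std::printf("s=%d -> left=%d right=%d\n", int(acl_shifts[i]), int(left[i]), int(right[i]));
    }
    std::printf("need_left=%s\n", need_left ? "true" : "false");
    return 0;
}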
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
new file mode 100644
index 0000000000..a50f3634c2
--- /dev/null
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
+#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/* Convolution method supported by the assembly gemm interface */
+enum class AsmConvMethod
+{
+ Im2Col,
+ Indirect,
+ Conv
+};
+
+struct AsmGemmInfo
+{
+ AsmConvMethod method{ AsmConvMethod::Im2Col };
+ PadStrideInfo ps_info{};
+ ActivationLayerInfo activation_info{};
+ GEMMLowpOutputStageInfo output_stage{};
+ bool negated_offsets{ true };
+ bool reinterpret_input_as_3d{ false };
+ bool depth_output_gemm3d{ false };
+ int64_t padding_top{ 0 };
+ int64_t padding_left{ 0 };
+ float padding_value{ 0.f };
+ bool fast_mode{ false };
+};
+
+/** Assembly kernel glue */
+class CpuGemmAssemblyDispatch : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmAssemblyDispatch();
+ /** Default destructor */
+ ~CpuGemmAssemblyDispatch() = default;
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch);
+
+ class IFallback
+ {
+ public:
+ virtual void run(ITensorPack &tensors) = 0;
+ virtual void prepare(ITensorPack &tensors) = 0;
+ virtual experimental::MemoryRequirements workspace() const = 0;
+ virtual bool is_configured() const = 0;
+ virtual ~IFallback() = default;
+ };
+
+public:
+ /** If supported, create a Compute Library function, else fall back to the arm_gemm function.
+ *
+ * @param[in] a Input tensor info (Matrix A)
+ * @param[in] b Input tensor info (Matrix B)
+ * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
+ * @param[out] d Output tensor info to store the result of matrix multiplication. Data type supported: same as @p a.
+ * @param[in] info GEMM meta-data
+ */
+ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+
+ /** Indicates whether or not this function can be used to process the given parameters.
+ *
+ * @param[in] a Input tensor info (Matrix A)
+ * @param[in] b Input tensor info (Matrix B)
+ * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
+ * @param[in] d Output tensor info to store the result of matrix multiplication. Data type supported: same as @p a.
+ * @param[in] info GEMM meta-data
+ *
+ * @return a status.
+ */
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+ /** Checks if activation is supported by the gemm assembly dispatcher
+ *
+ * @param[in] activation Activation to check
+ *
+ * @return True if activation is supported else false
+ */
+ static bool is_activation_supported(const ActivationLayerInfo &activation);
+ /** Was the function successfully configured?
+ *
+ * @return True if the function is configured and ready to run
+ */
+ bool is_configured() const;
+
+ // Inherited methods overridden:
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */
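Editor's note: a minimal usage sketch, not part of the patch, of the validate -> configure -> is_configured -> prepare -> run flow for the dispatcher, here for a plain F32 GEMM. bind_workspace() is a hypothetical caller-supplied helper that allocates and attaches the buffers advertised by workspace() (AsmGemmWorkspace and, if needed, Pretranspose); further includes are assumed to come in transitively.

#include "arm_compute/core/ITensorPack.h"
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

using namespace arm_compute;

// Sketch only (not part of the patch).
void asm_gemm_f32_example(const ITensorInfo *a_info, const ITensorInfo *b_info, ITensorInfo *d_info,
                          const ITensor *a, const ITensor *b, ITensor *d,
                          void (*bind_workspace)(ITensorPack &, const experimental::MemoryRequirements &))
{
    cpu::AsmGemmInfo info{}; // defaults: Im2Col method, no fused activation, fast_mode off
    cpu::CpuGemmAssemblyDispatch gemm;

    if(!cpu::CpuGemmAssemblyDispatch::validate(a_info, b_info, nullptr, d_info, info))
    {
        return; // data type combination not supported by the assembly path
    }
    gemm.configure(a_info, b_info, nullptr, d_info, info);
    if(!gemm.is_configured())
    {
        return; // configure() bails out silently when arm_gemm has no suitable kernel
    }

    ITensorPack pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, d } };
    bind_workspace(pack, gemm.workspace()); // hypothetical helper, see note above
    gemm.prepare(pack);                     // pre-transposes B when the selected kernel requires it
    gemm.run(pack);
}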