Diffstat (limited to 'src/cpu/operators')
-rw-r--r--  src/cpu/operators/CpuActivation.cpp | 14
-rw-r--r--  src/cpu/operators/CpuActivation.h | 1
-rw-r--r--  src/cpu/operators/CpuAdd.cpp | 17
-rw-r--r--  src/cpu/operators/CpuAdd.h | 13
-rw-r--r--  src/cpu/operators/CpuAddMulAdd.cpp | 85
-rw-r--r--  src/cpu/operators/CpuAddMulAdd.h | 26
-rw-r--r--  src/cpu/operators/CpuCast.cpp | 3
-rw-r--r--  src/cpu/operators/CpuConcatenate.cpp | 34
-rw-r--r--  src/cpu/operators/CpuConcatenate.h | 4
-rw-r--r--  src/cpu/operators/CpuConv2d.cpp | 140
-rw-r--r--  src/cpu/operators/CpuConv2d.h | 40
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.cpp | 13
-rw-r--r--  src/cpu/operators/CpuConvertFullyConnectedWeights.h | 8
-rw-r--r--  src/cpu/operators/CpuCopy.cpp | 3
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.cpp | 157
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2d.h | 86
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp | 29
-rw-r--r--  src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h | 17
-rw-r--r--  src/cpu/operators/CpuDequantize.cpp | 1
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.cpp | 50
-rw-r--r--  src/cpu/operators/CpuDirectConv2d.h | 24
-rw-r--r--  src/cpu/operators/CpuDirectConv3d.cpp | 27
-rw-r--r--  src/cpu/operators/CpuDirectConv3d.h | 16
-rw-r--r--  src/cpu/operators/CpuElementwise.cpp | 18
-rw-r--r--  src/cpu/operators/CpuElementwise.h | 5
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.cpp | 5
-rw-r--r--  src/cpu/operators/CpuElementwiseUnary.h | 3
-rw-r--r--  src/cpu/operators/CpuFill.cpp | 3
-rw-r--r--  src/cpu/operators/CpuFill.h | 1
-rw-r--r--  src/cpu/operators/CpuFlatten.cpp | 6
-rw-r--r--  src/cpu/operators/CpuFloor.cpp | 3
-rw-r--r--  src/cpu/operators/CpuFullyConnected.cpp | 225
-rw-r--r--  src/cpu/operators/CpuFullyConnected.h | 52
-rw-r--r--  src/cpu/operators/CpuGemm.cpp | 198
-rw-r--r--  src/cpu/operators/CpuGemm.h | 66
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.cpp | 378
-rw-r--r--  src/cpu/operators/CpuGemmConv2d.h | 78
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.cpp | 85
-rw-r--r--  src/cpu/operators/CpuGemmDirectConv2d.h | 17
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp | 365
-rw-r--r--  src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h | 17
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.cpp | 52
-rw-r--r--  src/cpu/operators/CpuGemmLowpOutputStage.h | 6
-rw-r--r--  src/cpu/operators/CpuMatMul.cpp | 113
-rw-r--r--  src/cpu/operators/CpuMatMul.h | 34
-rw-r--r--  src/cpu/operators/CpuMaxUnpooling.cpp | 13
-rw-r--r--  src/cpu/operators/CpuMaxUnpooling.h | 8
-rw-r--r--  src/cpu/operators/CpuMul.cpp | 27
-rw-r--r--  src/cpu/operators/CpuMul.h | 25
-rw-r--r--  src/cpu/operators/CpuPermute.cpp | 5
-rw-r--r--  src/cpu/operators/CpuPool2d.cpp | 35
-rw-r--r--  src/cpu/operators/CpuPool2d.h | 11
-rw-r--r--  src/cpu/operators/CpuPool3d.cpp | 6
-rw-r--r--  src/cpu/operators/CpuPool3d.h | 3
-rw-r--r--  src/cpu/operators/CpuQuantize.cpp | 1
-rw-r--r--  src/cpu/operators/CpuReshape.cpp | 7
-rw-r--r--  src/cpu/operators/CpuReshape.h | 5
-rw-r--r--  src/cpu/operators/CpuScale.cpp | 128
-rw-r--r--  src/cpu/operators/CpuScale.h | 9
-rw-r--r--  src/cpu/operators/CpuSoftmax.cpp | 101
-rw-r--r--  src/cpu/operators/CpuSoftmax.h | 6
-rw-r--r--  src/cpu/operators/CpuSub.cpp | 17
-rw-r--r--  src/cpu/operators/CpuSub.h | 13
-rw-r--r--  src/cpu/operators/CpuTranspose.cpp | 5
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.cpp | 263
-rw-r--r--  src/cpu/operators/CpuWinogradConv2d.h | 62
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 393
-rw-r--r--  src/cpu/operators/internal/CpuGemmAssemblyDispatch.h | 55
68 files changed, 2300 insertions, 1436 deletions
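
For orientation, the hunks below appear to be dominated by a mechanical formatting sweep (a space after control-flow keywords, long signatures broken to one parameter per line and aligned, includes re-sorted, brace-initializer spacing tightened) rather than behavioural changes. The following is a minimal illustrative sketch of the two most common transformations; the names (clamp_old, clamp_new) are placeholders and are not taken from the patch.

#include <cstdio>

// Old style seen on the '-' lines: no space after 'if', whole body on one line.
static int clamp_old(int value, int low, int high) { if(value < low) { return low; } if(value > high) { return high; } return value; }

// New style seen on the '+' lines: space after 'if', parameters one per line
// and aligned on the opening parenthesis.
static int clamp_new(int value,
                     int low,
                     int high)
{
    if (value < low)
    {
        return low;
    }
    if (value > high)
    {
        return high;
    }
    return value;
}

int main()
{
    std::printf("%d %d\n", clamp_old(5, 0, 3), clamp_new(-1, 0, 3)); // prints "3 0"
    return 0;
}
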
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
index 197e9850b9..44d70cf503 100644
--- a/src/cpu/operators/CpuActivation.cpp
+++ b/src/cpu/operators/CpuActivation.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/operators/CpuActivation.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/IOperator.h"
#include "src/common/utils/LegacySupport.h"
#include "src/common/utils/Log.h"
@@ -42,7 +43,8 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con
_kernel = std::move(k);
}
-Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+Status
+CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
{
return kernels::CpuActivationKernel::validate(input, output, activation_info);
}
@@ -54,13 +56,17 @@ void CpuActivation::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
-std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src,
+ const AclTensorDescriptor &dst,
+ const AclActivationDescriptor &act,
+ bool is_validate)
{
TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
auto info = detail::convert_to_activation_info(act);
- if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+ if (is_validate &&
+ !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
{
return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
}
@@ -69,7 +75,7 @@ std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTenso
act_op->configure(&src_info, &dst_info, info);
auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
- if(op == nullptr)
+ if (op == nullptr)
{
ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
return std::make_tuple(nullptr, StatusCode::OutOfMemory);
diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h
index e21fc7d32c..ec442f92c8 100644
--- a/src/cpu/operators/CpuActivation.h
+++ b/src/cpu/operators/CpuActivation.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ACTIVATION_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
index 41def8e22f..53cd7fa1b7 100644
--- a/src/cpu/operators/CpuAdd.cpp
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -23,17 +23,20 @@
*/
#include "src/cpu/operators/CpuAdd.h"
-#include "src/cpu/kernels/CpuAddKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
namespace arm_compute
{
namespace cpu
{
-void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAdd::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info);
@@ -42,7 +45,11 @@ void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor
_kernel = std::move(k);
}
-Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAdd::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h
index db05c100cc..5f60102de2 100644
--- a/src/cpu/operators/CpuAdd.h
+++ b/src/cpu/operators/CpuAdd.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ADD_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -55,14 +56,22 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuAdd::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp
index 590ee482ca..2f19f2f842 100644
--- a/src/cpu/operators/CpuAddMulAdd.cpp
+++ b/src/cpu/operators/CpuAddMulAdd.cpp
@@ -21,39 +21,49 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "src/cpu/operators/CpuAddMulAdd.h"
+
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuAddMulAddKernel.h"
-#include "src/cpu/operators/CpuAddMulAdd.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
namespace arm_compute
{
namespace cpu
{
-void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAddMulAdd::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
auto k = std::make_unique<kernels::CpuAddMulAddKernel>();
const DataType data_type = input1->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
_dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul);
_dequantize_bn_add.configure(bn_add, &_dequantized_bn_add);
- k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info);
+ k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy,
+ act_info);
// Save auxilary memory requirements after configuration
- _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size());
- _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size());
+ _aux_mem[DequantizedBnMul] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_mul.total_size());
+ _aux_mem[DequantizedBnAdd] =
+ experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary,
+ _dequantized_bn_add.total_size());
}
else
{
@@ -63,13 +73,17 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input
_kernel = std::move(k);
}
-Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAddMulAdd::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
const DataType data_type = input1->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32);
TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32);
@@ -77,11 +91,13 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *inpu
ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul));
ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add));
- return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info);
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add,
+ add_output, final_output, policy, act_info);
}
else
{
- return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+ return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy,
+ act_info);
}
}
@@ -89,37 +105,32 @@ void CpuAddMulAdd::run(ITensorPack &tensors)
{
const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type();
- if(is_data_type_quantized(data_type))
+ if (is_data_type_quantized(data_type))
{
const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2);
const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3);
- CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true);
- CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true);
+ CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors,
+ true);
+ CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors,
+ true);
- ITensorPack dequantize_mul_pack =
- {
- { TensorType::ACL_SRC_0, bn_mul },
- { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() }
- };
+ ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul},
+ {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}};
- ITensorPack dequantize_add_pack =
- {
- { TensorType::ACL_SRC_0, bn_add },
- { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() }
- };
+ ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add},
+ {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}};
_dequantize_bn_mul.run(dequantize_mul_pack);
_dequantize_bn_add.run(dequantize_add_pack);
- ITensorPack add_mul_add_pack =
- {
- { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) },
- { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
- { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() },
- { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() },
- { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) },
- { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) },
+ ITensorPack add_mul_add_pack = {
+ {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)},
+ {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)},
+ {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()},
+ {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()},
+ {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)},
+ {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)},
};
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack);
diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h
index cf1ece68f1..47db75c37e 100644
--- a/src/cpu/operators/CpuAddMulAdd.h
+++ b/src/cpu/operators/CpuAddMulAdd.h
@@ -42,20 +42,28 @@ public:
* Similar to @ref NEAddMulAdd::configure()
*
*/
- void configure(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- ITensorInfo *add_output, ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ void configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ ITensorInfo *add_output,
+ ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuAddMulAdd::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
- const ITensorInfo *add_output, const ITensorInfo *final_output,
- ConvertPolicy policy, const ActivationLayerInfo &act_info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -77,7 +85,7 @@ private:
TensorInfo _dequantized_bn_mul{};
TensorInfo _dequantized_bn_add{};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp
index 1cfd8c1d0e..55b9204d71 100644
--- a/src/cpu/operators/CpuCast.cpp
+++ b/src/cpu/operators/CpuCast.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuCast.h"
-#include "src/cpu/kernels/CpuCastKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp
index 4021fd8ded..5f517a8fcb 100644
--- a/src/cpu/operators/CpuConcatenate.cpp
+++ b/src/cpu/operators/CpuConcatenate.cpp
@@ -23,21 +23,20 @@
*/
#include "src/cpu/operators/CpuConcatenate.h"
-#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
-#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
-#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
-#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
+#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
+#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
+#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
namespace arm_compute
{
@@ -59,9 +58,9 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect
unsigned int offset = 0;
- for(unsigned int i = 0; i < _num_srcs; ++i)
+ for (unsigned int i = 0; i < _num_srcs; ++i)
{
- switch(axis)
+ switch (axis)
{
case Window::DimX:
{
@@ -98,16 +97,17 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect
}
}
-Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
+Status
+CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
unsigned int offset = 0;
- for(const auto &src : srcs_vector)
+ for (const auto &src : srcs_vector)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- switch(axis)
+ switch (axis)
{
case Window::DimX:
{
@@ -135,7 +135,7 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec
offset += src->dimension(axis);
}
- if(dst->total_size() != 0)
+ if (dst->total_size() != 0)
{
TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
@@ -146,18 +146,18 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec
void CpuConcatenate::run(ITensorPack &tensors)
{
- if(tensors.empty())
+ if (tensors.empty())
{
ARM_COMPUTE_ERROR("No inputs provided");
}
- if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
+ if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
{
ARM_COMPUTE_ERROR("Configured with different number of inputs");
}
int i = 0;
- for(auto &k : _concat_kernels)
+ for (auto &k : _concat_kernels)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
index eb11926b48..c36977c70f 100644
--- a/src/cpu/operators/CpuConcatenate.h
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -68,8 +68,8 @@ public:
private:
std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{};
- unsigned int _num_srcs{ 0 };
- unsigned int _axis{ 0 };
+ unsigned int _num_srcs{0};
+ unsigned int _axis{0};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
index 16ac16b3ba..19311733db 100644
--- a/src/cpu/operators/CpuConv2d.cpp
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -22,8 +22,10 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuConv2d.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/operators/CpuDirectConv2d.h"
#include "src/cpu/operators/CpuGemm.h"
@@ -35,26 +37,35 @@ namespace arm_compute
{
namespace cpu
{
-CpuConv2d::CpuConv2d()
- : _function()
+CpuConv2d::CpuConv2d() : _function()
{
}
CpuConv2d::~CpuConv2d() = default;
-void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuConv2d::configure(ITensorInfo *input,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
+ ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
- ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
{
@@ -92,19 +103,30 @@ void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITenso
_aux_mem = _function->workspace();
}
-Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuConv2d::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM:
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info,
+ dilation, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM_CONV2D:
ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
@@ -120,9 +142,14 @@ Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights,
return Status{};
}
-ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
ARM_COMPUTE_UNUSED(weights_info);
@@ -137,35 +164,46 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
- const std::vector<ConfigurationMethod> known_configs =
- {
+ const std::vector<ConfigurationMethod> known_configs = {
// Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+ PadStrideInfo(1U, 1U, 2U, 2U)),
+ ConvolutionMethod::GEMM),
// VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionMethod::GEMM),
// Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM),
// Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
- };
+ ConfigurationMethod(
+ ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+ ConvolutionMethod::GEMM)};
const auto find_config = [&](ConfigurationMethod c)
{
const ConvolutionConfiguration config = c.first;
const PadStrideInfo info = std::get<3>(config);
- return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
};
std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
{
return (*found).second;
}
- if(dilation != Size2D(1U, 1U))
+ if (dilation != Size2D(1U, 1U))
{
return ConvolutionMethod::GEMM;
}
@@ -173,43 +211,49 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
{
// SRGAN
// Output might not be initialized when it is an internal tensor of the layer using the convolution
- if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
- && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+ if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) &&
+ (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
{
return ConvolutionMethod::DIRECT;
}
- if(input->dimension(idx_c) < 16)
+ if (input->dimension(idx_c) < 16)
{
return ConvolutionMethod::GEMM;
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// This heuristics only applies to F16 data type on A55r1
- if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
+ if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math &&
+ input->data_type() == DataType::F16)
{
// Exclude known bad winograd configs (and defaults to GEMM)
- const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
- {
+ const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = {
// Squeezenet_V1_1 fire2 and fire3
- ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
// Squeezenet_V1_1 fire6 and fire7
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
// Squeezenet_V1_1 fire8 and fire9
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
+ ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U),
+ PadStrideInfo(1U, 1U, 1U, 1U)),
};
const auto find_conv_config = [&](ConvolutionConfiguration c)
{
const PadStrideInfo info = std::get<3>(c);
- return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+ return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+ std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride();
};
- bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
- find_conv_config)
- != known_bad_winograd_f16_with_fastmath_configs.end();
- if(found_bad)
+ bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(),
+ known_bad_winograd_f16_with_fastmath_configs.end(),
+ find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end();
+ if (found_bad)
{
return ConvolutionMethod::GEMM;
}
@@ -217,16 +261,16 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// For 1x1 convolutions run the default GEMM
- if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+ if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
{
return ConvolutionMethod::GEMM;
}
- if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+ if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
{
return ConvolutionMethod::WINOGRAD;
}
- if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+ if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
{
return ConvolutionMethod::GEMM_CONV2D;
}
diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h
index 0908ac0cbb..71b9e15dc1 100644
--- a/src/cpu/operators/CpuConv2d.h
+++ b/src/cpu/operators/CpuConv2d.h
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -102,17 +103,32 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
*
* Similar to CpuConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref CpuConv2d
*
* @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -132,11 +148,17 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
index 810ffb1e4e..49e31926e3 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
@@ -24,6 +24,7 @@
#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
@@ -31,7 +32,10 @@ namespace arm_compute
{
namespace cpu
{
-void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
@@ -39,7 +43,10 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorI
_kernel = std::move(k);
}
-Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
{
return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
}
@@ -48,5 +55,5 @@ void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
{
NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
index ea70eee134..e208cca3a0 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
@@ -41,14 +41,18 @@ public:
* @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ void
+ configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuConvertFullyConnectedWeights::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
};
diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp
index 7420ff6240..92c19d4df2 100644
--- a/src/cpu/operators/CpuCopy.cpp
+++ b/src/cpu/operators/CpuCopy.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuCopy.h"
-#include "src/cpu/kernels/CpuCopyKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCopyKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp
index 884fe5c4ed..54075f2afa 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -24,10 +24,11 @@
#include "src/cpu/operators/CpuDepthwiseConv2d.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
@@ -37,11 +38,16 @@ namespace cpu
{
namespace
{
-Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status validate_arguments_optimized(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- if(!is_data_type_quantized_per_channel(weights->data_type()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ if (!is_data_type_quantized_per_channel(weights->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
@@ -49,14 +55,17 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
- info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
- info.pad_stride_info.pad_bottom());
-
- if(biases != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) >
+ src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+ info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) >
+ src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+ info.pad_stride_info.pad_bottom());
+
+ if (biases != nullptr)
{
- const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
}
@@ -64,7 +73,7 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w
ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
// Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
@@ -80,8 +89,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
_has_bias = biases != nullptr;
@@ -91,10 +100,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
_are_weights_const = weights->are_values_constant();
// Configure pipeline
- _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
+ _is_activationlayer_enabled =
+ info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
_dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
_permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -128,7 +138,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI
}
// Configure activation
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
_activationlayer_function->configure(dst, nullptr, info.act_info);
@@ -155,7 +165,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
// Permute input
- if(_permute)
+ if (_permute)
{
ITensorPack pack;
auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -166,7 +176,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Run assembly function
- if(_is_nchw)
+ if (_is_nchw)
{
auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -198,7 +208,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Permute output
- if(_is_nchw)
+ if (_is_nchw)
{
ITensorPack pack;
auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -208,7 +218,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t
}
// Run activation
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -221,7 +231,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
{
// if weights are not constant then we need to repack so that weights
// can be updated in-place
- if(!_are_weights_const)
+ if (!_are_weights_const)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -238,14 +248,14 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
return;
}
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
// Permute weights
- if(_permute)
+ if (_permute)
{
auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -279,11 +289,15 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac
}
}
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
- dst, info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
_is_nchw = src->data_layout() == DataLayout::NCHW;
_is_prepared = !_is_nchw;
@@ -294,9 +308,10 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
auto input_perm = std::make_unique<TensorInfo>();
auto weights_perm = std::make_unique<TensorInfo>();
- auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ auto output_perm = std::make_unique<TensorInfo>(
+ dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_input = std::make_unique<cpu::CpuPermute>();
_permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -315,7 +330,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
_depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
_depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
- if(_is_nchw)
+ if (_is_nchw)
{
_permute_output = std::make_unique<cpu::CpuPermute>();
_permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
@@ -324,43 +339,61 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src,
//Configure Activation Layer
_is_activationlayer_enabled = info.act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<cpu::CpuActivation>();
_activationlayer_function->configure(dst, nullptr, info.act_info);
}
}
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- if(src->data_layout() == DataLayout::NCHW)
+ if (src->data_layout() == DataLayout::NCHW)
{
TensorShape permuted_input_shape = src->tensor_shape();
TensorShape permuted_weights_shape = weights->tensor_shape();
- TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ TensorShape permuted_output_shape =
+ misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
- const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
- const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+ const TensorInfo permuted_input = TensorInfo(src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_input_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_weights = TensorInfo(weights->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_weights_shape)
+ .set_data_layout(DataLayout::NHWC));
+ const TensorInfo permuted_output = TensorInfo(dst->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(permuted_output_shape)
+ .set_data_layout(DataLayout::NCHW));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(
+ &permuted_input, &permuted_weights, biases, &permuted_output, info));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
}
// Validate Activation Layer
- if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+ if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
}
@@ -375,7 +408,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- if(_is_nchw)
+ if (_is_nchw)
{
prepare(tensors);
auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
@@ -392,7 +425,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
}
else
{
@@ -401,10 +435,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
pack_depth.add_tensor(TensorType::ACL_DST, dst);
- NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+ NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+ pack_depth);
}
- if(_is_nchw)
+ if (_is_nchw)
{
ITensorPack pack;
auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -413,7 +448,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
_permute_output->run(pack);
}
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -424,7 +459,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors)
void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -441,12 +476,17 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors
}
}
-void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
- _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
- switch(_depth_conv_func)
+ _depth_conv_func =
+ get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.configure(src, weights, biases, dst, info);
@@ -459,10 +499,14 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights,
}
}
-Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
- switch(depth_conv_func)
+ switch (depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
@@ -475,10 +519,13 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *w
}
}
-DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info)
{
- if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+ if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
{
return DepthwiseConvolutionFunction::OPTIMIZED;
}
@@ -490,7 +537,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi
void CpuDepthwiseConv2d::run(ITensorPack &tensors)
{
- switch(_depth_conv_func)
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.run(tensors);
@@ -505,7 +552,7 @@ void CpuDepthwiseConv2d::run(ITensorPack &tensors)
void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
{
- switch(_depth_conv_func)
+ switch (_depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_func_optimized.prepare(tensors);
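The dispatch kept by these hunks is validate-driven: get_depthwiseconvolution_function() returns OPTIMIZED only when the assembly-backed path validates for the given tensor metadata, and falls back to GENERIC otherwise. A minimal, validate-only sketch of exercising that choice is below; the NHWC shapes, the ConvolutionInfo field names and the 3x3/stride-1 configuration are illustrative assumptions rather than part of this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuDepthwiseConv2d.h"

using namespace arm_compute;

Status check_depthwise_3x3()
{
    // Metadata only, no allocations: 1x16x16x8 NHWC input, 3x3 depthwise weights, per-channel bias.
    TensorInfo src(TensorShape(8U, 16U, 16U, 1U), 1, DataType::F32);
    TensorInfo weights(TensorShape(8U, 3U, 3U), 1, DataType::F32);
    TensorInfo bias(TensorShape(8U), 1, DataType::F32);
    TensorInfo dst(TensorShape(8U, 16U, 16U, 1U), 1, DataType::F32);
    for (auto *t : {&src, &weights, &dst})
    {
        t->set_data_layout(DataLayout::NHWC);
    }

    ConvolutionInfo info{};                            // field names below are assumed
    info.pad_stride_info  = PadStrideInfo(1, 1, 1, 1); // stride 1, symmetric padding 1 (SAME for a 3x3 kernel)
    info.depth_multiplier = 1;

    // OPTIMIZED means the assembly dispatch validated for this configuration; GENERIC uses the native kernel.
    const auto func = cpu::CpuDepthwiseConv2d::get_depthwiseconvolution_function(&src, &weights, &bias, &dst, info);
    (void)func;
    return cpu::CpuDepthwiseConv2d::validate(&src, &weights, &bias, &dst, info);
}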
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h
index 3d8719ee44..7eaa0df857 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.h
+++ b/src/cpu/operators/CpuDepthwiseConv2d.h
@@ -24,8 +24,9 @@
#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
-#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
@@ -56,14 +57,22 @@ public:
* Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
*
* @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -76,7 +85,10 @@ public:
*
* @return a Depthwise Convolution Function
*/
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
const ConvolutionInfo &info);
// Inherited methods overridden:
@@ -118,32 +130,40 @@ private:
* @param[out] dst Destination tensor info. Data type supported: same as @p src.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
void prepare(ITensorPack &tensors) override;
private:
- std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _has_bias{ false };
- bool _is_quantized{ false };
- bool _is_nchw{ true };
- bool _permute{ false };
- bool _is_activationlayer_enabled{ false };
- bool _is_prepared{ false };
- bool _are_weights_const{ true };
+ std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _has_bias{false};
+ bool _is_quantized{false};
+ bool _is_nchw{true};
+ bool _permute{false};
+ bool _is_activationlayer_enabled{false};
+ bool _is_prepared{false};
+ bool _are_weights_const{true};
};
/** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
@@ -176,7 +196,11 @@ private:
* Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
@@ -184,24 +208,28 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
void prepare(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
- std::unique_ptr<CpuPermute> _permute_input{ nullptr };
- std::unique_ptr<CpuPermute> _permute_weights{ nullptr };
- std::unique_ptr<CpuPermute> _permute_output{ nullptr };
- std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr };
- bool _is_nchw{ true };
- bool _is_prepared{ false };
- bool _is_activationlayer_enabled{ false };
+ std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _is_nchw{true};
+ bool _is_prepared{false};
+ bool _is_activationlayer_enabled{false};
};
- DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC };
+ DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC};
CpuDepthwiseConv2dOptimizedInternal _func_optimized{};
CpuDepthwiseConv2dGeneric _func_generic{};
};
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index d078155155..8d3741de96 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -38,15 +39,14 @@ namespace cpu
{
struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
{
- std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
- bool is_prepared{ false };
- bool are_weights_const{ true };
+ std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr};
+ bool is_prepared{false};
+ bool are_weights_const{true};
experimental::MemoryRequirements mem_req{};
};
#ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
- : _pImpl(std::make_unique<LocalImpl>())
+CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>())
{
}
#endif /* DOXYGEN_SKIP_THIS */
@@ -66,7 +66,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
_pImpl->are_weights_const = weights->are_values_constant();
// If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+ if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
{
return;
}
@@ -77,12 +77,16 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
// Compute memory requirements for assembly kernels
constexpr size_t alignment = 4096;
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment });
- _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment});
+ _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment});
_pImpl->asm_kernel = std::move(dwc_wrapper);
}
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info)
{
return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
}
@@ -111,7 +115,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
{
const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
+ if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
{
// Pack weights and bias
const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -125,11 +129,12 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors)
const auto weights_padding = weights->info()->padding();
const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
- const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
+ const size_t ld_weights_row =
+ ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
_pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
weights->mark_as_unused();
- if(bias != nullptr)
+ if (bias != nullptr)
{
bias->mark_as_unused();
}
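For reference, the leading-dimension arithmetic in prepare() reduces to two products over the padded weight shape before they are handed to pack_parameters(). A tiny standalone sketch with assumed sizes (32 channels, 3-wide kernel, no element padding):

#include <cstdio>

int main()
{
    // Assumed packed-weights geometry: dim0 = channels, dim1 = kernel width, plus any element padding.
    const unsigned int dim0 = 32, kernel_w = 3;
    const unsigned int pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0;

    // Same formulas as the hunk above computes for pack_parameters().
    const unsigned int ld_weights_col = dim0 + pad_left + pad_right;
    const unsigned int ld_weights_row = ld_weights_col * (kernel_w + pad_top + pad_bottom);

    std::printf("ld_weights_col=%u ld_weights_row=%u\n", ld_weights_col, ld_weights_row); // 32 and 96 here
    return 0;
}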
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
index f222ab9cf9..f1816625d2 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -53,14 +54,22 @@ public:
* @param[out] dst Destination tensor info. Data type supported: same as @p src.
* @param[in] info Depthwise convolution meta-data.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
/** Checks if activation is supported by the assembly kernels
*
* @param[in] activation Activation to check
@@ -70,8 +79,8 @@ public:
static bool is_activation_supported(const ActivationLayerInfo &activation);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp
index 12dc136ba3..c05a23f3a7 100644
--- a/src/cpu/operators/CpuDequantize.cpp
+++ b/src/cpu/operators/CpuDequantize.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuDequantizeKernel.h"
diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp
index 9cdbdb61c1..135a3bb2b9 100644
--- a/src/cpu/operators/CpuDirectConv2d.cpp
+++ b/src/cpu/operators/CpuDirectConv2d.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -36,12 +37,25 @@ namespace cpu
CpuDirectConv2d::~CpuDirectConv2d() = default;
CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
- _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+ : _memory_group(std::move(memory_manager)),
+ _output_stage_kernel(),
+ _conv_kernel(),
+ _input_border_handler(),
+ _activationlayer_function(),
+ _accumulator(),
+ _has_bias(false),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ),
+ _is_padding_required()
{
}
-void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CpuDirectConv2d::configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info);
@@ -51,7 +65,7 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT
_input_border_handler = std::make_unique<NEFillBorderKernel>();
// Free accumulator
- if(_accumulator.buffer() != nullptr)
+ if (_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
@@ -62,28 +76,33 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT
_has_bias = (bias != nullptr);
_conv_kernel->configure(src, weights, dst, conv_info);
- if(_has_bias)
+ if (_has_bias)
{
_output_stage_kernel->configure(dst, bias);
}
_is_padding_required = !_conv_kernel->border_size().empty();
- if(_is_padding_required)
+ if (_is_padding_required)
{
// Add zero padding XY
- _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+ _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT,
+ PixelValue(static_cast<float>(0.f)));
}
//Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<CpuActivation>();
_activationlayer_function->configure(dst, dst, act_info);
}
}
-Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+Status CpuDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -95,7 +114,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
- if(bias != nullptr)
+ if (bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
@@ -106,7 +125,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig
// Validate bias kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
}
@@ -122,14 +141,15 @@ void CpuDirectConv2d::run(ITensorPack &tensors)
auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
auto dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_is_padding_required)
+ if (_is_padding_required)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_DST, src);
- NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+ NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(),
+ pack);
}
NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_has_bias)
+ if (_has_bias)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, dst);
@@ -138,7 +158,7 @@ void CpuDirectConv2d::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
}
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
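The run() changes above all follow one pattern: assemble an ITensorPack with the slots a kernel expects, then pass it to NEScheduler::get().schedule_op() together with a split dimension and the kernel's window. A helper-shaped sketch of that pattern for the bias/output-stage step follows; the kernel pointer and tensors are assumed to come from an operator configured elsewhere, and the slot assignment is abbreviated, so this is illustrative rather than a complete program.

#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

using namespace arm_compute;

// Sketches the output-stage dispatch in CpuDirectConv2d::run(): ACL_SRC_0 carries the tensor the
// convolution kernel wrote, ACL_SRC_1 the bias, and the kernel is split over DimY.
void run_output_stage(ICPPKernel *output_stage_kernel, ITensor *dst, const ITensor *bias)
{
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, dst);
    pack.add_const_tensor(TensorType::ACL_SRC_1, bias);
    NEScheduler::get().schedule_op(output_stage_kernel, Window::DimY, output_stage_kernel->window(), pack);
}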
diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h
index fa8d61e083..73c85f2dcd 100644
--- a/src/cpu/operators/CpuDirectConv2d.h
+++ b/src/cpu/operators/CpuDirectConv2d.h
@@ -24,13 +24,14 @@
#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
#define ARM_COMPUTE_CPU_DIRECTCONV2D_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -75,14 +76,23 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *src,
+ ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -95,10 +105,10 @@ private:
std::unique_ptr<NEFillBorderKernel> _input_border_handler;
std::unique_ptr<CpuActivation> _activationlayer_function;
Tensor _accumulator;
- bool _has_bias{ false };
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
- bool _is_padding_required{ false };
+ bool _has_bias{false};
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
+ bool _is_padding_required{false};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp
index aa74e420a6..626f1c6775 100644
--- a/src/cpu/operators/CpuDirectConv3d.cpp
+++ b/src/cpu/operators/CpuDirectConv3d.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
namespace arm_compute
@@ -36,11 +37,17 @@ namespace cpu
CpuDirectConv3d::~CpuDirectConv3d() = default;
CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
+ : _memory_group(std::move(memory_manager)),
+ _conv_kernel(),
+ _activationlayer_function(),
+ _accumulator(),
+ _is_activationlayer_enabled(false),
+ _dim_split(Window::DimZ)
{
}
-void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
+void CpuDirectConv3d::configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
{
ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info);
ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
@@ -48,7 +55,7 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen
_conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>();
// Free accumulator
- if(_accumulator.buffer() != nullptr)
+ if (_accumulator.buffer() != nullptr)
{
_accumulator.allocator()->free();
}
@@ -59,21 +66,25 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen
//Configure Activation Layer
_is_activationlayer_enabled = conv_info.act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activationlayer_function = std::make_unique<CpuActivation>();
_activationlayer_function->configure(dst, dst, conv_info.act_info);
}
}
-Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info)
+Status CpuDirectConv3d::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info));
- if(conv_info.act_info.enabled())
+ if (conv_info.act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info));
}
@@ -89,7 +100,7 @@ void CpuDirectConv3d::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -98,4 +109,4 @@ void CpuDirectConv3d::run(ITensorPack &tensors)
}
}
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h
index cde01f07c2..3ad1e09a14 100644
--- a/src/cpu/operators/CpuDirectConv3d.h
+++ b/src/cpu/operators/CpuDirectConv3d.h
@@ -24,14 +24,15 @@
#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H
#define ARM_COMPUTE_CPU_DIRECTCONV3D_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -76,14 +77,19 @@ public:
* The 1st dimensions must be equal to the 1st dimension of the @p kernels tensor.
* @param[in] conv_info Contains padding, stride, and activation information.
*/
- void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
+ void configure(
+ ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuDirectConv3d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info);
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo conv_info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -93,8 +99,8 @@ private:
std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel;
std::unique_ptr<CpuActivation> _activationlayer_function;
Tensor _accumulator;
- bool _is_activationlayer_enabled{ false };
- unsigned int _dim_split{ 0 };
+ bool _is_activationlayer_enabled{false};
+ unsigned int _dim_split{0};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp
index b88ae3e514..c2ae8773c6 100644
--- a/src/cpu/operators/CpuElementwise.cpp
+++ b/src/cpu/operators/CpuElementwise.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuElementwise.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/CpuElementwiseKernel.h"
@@ -33,7 +34,7 @@ namespace cpu
void CpuElementwiseBase::run(ITensorPack &tensors)
{
// If the kernel has been configured, use the window from the kernel.
- if(_kernel->is_window_configured())
+ if (_kernel->is_window_configured())
{
ICpuOperator::run(tensors);
return;
@@ -101,12 +102,16 @@ void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, con
}
template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status
+CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
{
return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
}
-void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
+void CpuElementwiseComparison::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ComparisonOperation op)
{
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
auto k = std::make_unique<kernels::CpuComparisonKernel>();
@@ -114,7 +119,10 @@ void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorI
_kernel = std::move(k);
}
-Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
+Status CpuElementwiseComparison::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ComparisonOperation op)
{
return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
}
@@ -127,4 +135,4 @@ template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>
template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h
index b6c61cf245..5db53c8026 100644
--- a/src/cpu/operators/CpuElementwise.h
+++ b/src/cpu/operators/CpuElementwise.h
@@ -139,7 +139,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
+ static Status
+ validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
};
/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
@@ -182,4 +183,4 @@ using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqua
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
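A validate-only sketch of the comparison operators declared above; the shapes, the U8 mask output type and the Greater operation are assumptions for illustration.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuElementwise.h"

using namespace arm_compute;

Status check_greater()
{
    const TensorInfo a(TensorShape(32U, 8U), 1, DataType::F32);
    const TensorInfo b(TensorShape(32U, 8U), 1, DataType::F32);
    const TensorInfo mask(TensorShape(32U, 8U), 1, DataType::U8); // assumed comparison output type
    // Static form; the runtime-op form is CpuElementwiseComparison::validate(&a, &b, &mask, ComparisonOperation::Greater).
    return cpu::CpuElementwiseComparisonStatic<ComparisonOperation::Greater>::validate(&a, &b, &mask);
}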
diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp
index 7fd14dba7d..04ab7bf8f5 100644
--- a/src/cpu/operators/CpuElementwiseUnary.cpp
+++ b/src/cpu/operators/CpuElementwiseUnary.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuElementwiseUnary.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
@@ -47,7 +48,7 @@ Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src
void CpuElementwiseUnary::run(ITensorPack &tensors)
{
- if(_kernel->is_window_configured())
+ if (_kernel->is_window_configured())
{
ICpuOperator::run(tensors);
return;
@@ -57,4 +58,4 @@ void CpuElementwiseUnary::run(ITensorPack &tensors)
ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
}
} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h
index 5e8e98d047..1e51bfaa1c 100644
--- a/src/cpu/operators/CpuElementwiseUnary.h
+++ b/src/cpu/operators/CpuElementwiseUnary.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
#include "arm_compute/core/Types.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -56,4 +57,4 @@ public:
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp
index 3d8f62fe07..1890d0b916 100644
--- a/src/cpu/operators/CpuFill.cpp
+++ b/src/cpu/operators/CpuFill.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuFill.h"
-#include "src/cpu/kernels/CpuFillKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFillKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h
index 41d9a9fa8a..cb83745d29 100644
--- a/src/cpu/operators/CpuFill.h
+++ b/src/cpu/operators/CpuFill.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_FILL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp
index 7bab9e481c..2609d44590 100644
--- a/src/cpu/operators/CpuFlatten.cpp
+++ b/src/cpu/operators/CpuFlatten.cpp
@@ -23,16 +23,14 @@
*/
#include "src/cpu/operators/CpuFlatten.h"
-#include "src/cpu/operators/CpuReshape.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuReshape.h"
namespace arm_compute
{
namespace cpu
{
-CpuFlatten::CpuFlatten()
- : _reshape(nullptr)
+CpuFlatten::CpuFlatten() : _reshape(nullptr)
{
}
diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp
index 868add7d29..a107393b01 100644
--- a/src/cpu/operators/CpuFloor.cpp
+++ b/src/cpu/operators/CpuFloor.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuFloor.h"
-#include "src/cpu/kernels/CpuFloorKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
namespace arm_compute
{
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
index 395d8d2aa5..85a0b0311b 100644
--- a/src/cpu/operators/CpuFullyConnected.cpp
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -25,10 +25,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
@@ -49,8 +50,11 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
const auto data_type = src->data_type();
const QuantizationInfo oq_info = dst->quantization_info();
@@ -62,10 +66,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
int32_t output_multiplier;
int32_t output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- int32_t type_min = 0;
- int32_t type_max = 0;
+ int32_t type_min = 0;
+ int32_t type_max = 0;
std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -78,14 +83,22 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
return Status{};
}
-Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format)
+Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ bool enable_fast_math,
+ WeightFormat weight_format)
{
- if(is_data_type_quantized_asymmetric(src->data_type()))
+ if (is_data_type_quantized_asymmetric(src->data_type()))
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));
@@ -97,11 +110,8 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe
// Validate gemmlowp function
TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
- ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info,
- &weights_info,
- biases,
- dst,
- gemm_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info));
}
else
{
@@ -142,21 +152,28 @@ CpuFullyConnected::CpuFullyConnected()
CpuFullyConnected::~CpuFullyConnected() = default;
-void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
- if(_is_quantized_asymmetric)
+ if (_is_quantized_asymmetric)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate src and weights offset
- const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+ -src->quantization_info().uniform().offset);
+ const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+ -weights->quantization_info().uniform().offset);
TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
// Configure gemmlowp function and output stage for asymmetric quantized types
GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
+ const Status status =
+ get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
GEMMInfo gemm_info;
@@ -179,7 +196,11 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *
}
}
-void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
@@ -195,7 +216,11 @@ void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorI
configure_mm(&_flattened_src, weights, biases, dst, act);
}
-void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
@@ -203,17 +228,17 @@ void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInf
configure_mm(src, weights, biases, dst, act);
}
-void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+void CpuFullyConnected::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- fc_info,
- weights_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
_needs_weights_conversion = false;
@@ -238,9 +263,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
// Check if we have a fully connected layer with batches
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
@@ -248,7 +275,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Reshape weights if needed
- if(_needs_weights_reshape)
+ if (_needs_weights_reshape)
{
// Reshape the weights
_transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
@@ -260,13 +287,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Convert weights if needed
- if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Convert weights
_convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
- _convert_weights->configure(weights_to_use,
- &_converted_weights,
- src->tensor_shape(),
+ _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(),
fc_info.weights_trained_layout);
_converted_weights.set_are_values_constant(weights_to_use->are_values_constant());
@@ -275,7 +300,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
_trans_weights_idx = AuxTensorIdx::ConvertedWeights;
}
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
@@ -287,54 +312,57 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei
}
// Retain the tensorinfo with the weights to use
- if(_needs_weights_reshape || _needs_weights_conversion)
+ if (_needs_weights_reshape || _needs_weights_conversion)
{
_trans_weights = *weights_to_use;
}
// Set auxiliary memory requirements
auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
- for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
{
_aux_mem[i] = gemm_mem_req[i];
}
- if(_aux_mem[Pretranspose].size > 0)
+ if (_aux_mem[Pretranspose].size > 0)
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
// Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation
// Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time.
_aux_mem[TransposedWeights] = MemoryInfo(
offset_int_vec(TransposedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary :
- (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent :
- MemoryLifetime::Prepare,
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare,
_reshaped_weights.total_size());
- _aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
- _converted_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
}
else
{
- _aux_mem[TransposedWeights] = MemoryInfo(
- offset_int_vec(TransposedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary :
- _needs_weights_conversion ? MemoryLifetime::Prepare :
- MemoryLifetime::Persistent,
- _reshaped_weights.total_size());
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+ _dynamic_weights ? MemoryLifetime::Temporary
+ : _needs_weights_conversion ? MemoryLifetime::Prepare
+ : MemoryLifetime::Persistent,
+ _reshaped_weights.total_size());
_aux_mem[ConvertedWeights] = MemoryInfo(
- offset_int_vec(ConvertedWeights),
- _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
+ offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
_converted_weights.total_size());
}
- _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+ _aux_mem[FlattenedSrc] =
+ MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
}
-Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info)
+Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info)
{
GEMMInfo gemm_info;
gemm_info.set_activation_info(fc_info.activation_info);
@@ -345,12 +373,17 @@ Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weigh
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
-Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+Status CpuFullyConnected::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
if (is_fixed_format_fast_math(weights_info.weight_format()))
{
@@ -364,15 +397,22 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
}
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
bool is_fc_after_conv = true;
- const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+ const ITensorInfo &flatten_src =
+ TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
+ const ITensorInfo &reshaped_weights = TensorInfo(
+ weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = weights_reshaped
+ ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+ : TensorInfo(*reshaped_weights.clone());
// With the Fully Connected layer we can have 4 different cases:
// 1) Convolution layer -> Fully Connected layer without batches
@@ -386,10 +426,10 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
// Check if we have a fully connected layer with batches
const bool is_batched_fc_layer = dst->dimension(1) > 1;
- if(biases != nullptr)
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -399,36 +439,37 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
}
}
- if(is_batched_fc_layer)
+ if (is_batched_fc_layer)
{
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
}
else
{
is_fc_after_conv = src->num_dimensions() > 1;
}
- if(!weights_reshaped)
+ if (!weights_reshaped)
{
// Validate reshape weights kernel
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
weights_to_use = &reshaped_weights;
}
- if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
{
// Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- src->tensor_shape(),
- fc_info.weights_trained_layout));
+ ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
weights_to_use = &converted_weights;
}
- if(is_fc_after_conv)
+ if (is_fc_after_conv)
{
// Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
// Validate flatten kernel
ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
@@ -440,7 +481,8 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we
ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
}
// Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format()));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info,
+ fc_info.enable_fast_math, weights_info.weight_format()));
return Status{};
}
@@ -460,21 +502,21 @@ void CpuFullyConnected::run(ITensorPack &tensors)
CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
// Linearize src if it comes from a convolutional layer
- if(_is_fc_after_conv)
+ if (_is_fc_after_conv)
{
- ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+ ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
_flatten->run(flatten_pack);
}
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
- if(_needs_weights_reshape || _needs_weights_conversion)
+ if (_needs_weights_reshape || _needs_weights_conversion)
{
gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
}
// Run matrix multiply
- if(_is_quantized_asymmetric)
+ if (_is_quantized_asymmetric)
{
_mm_gemmlowp->run(gemm_pack);
}
@@ -486,7 +528,7 @@ void CpuFullyConnected::run(ITensorPack &tensors)
void CpuFullyConnected::prepare(ITensorPack &tensors)
{
- if(!_is_prepared || _dynamic_weights)
+ if (!_is_prepared || _dynamic_weights)
{
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
++_asrt_prepare_count;
@@ -502,20 +544,21 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
const ITensor *cur_weights = weights;
// Reshape of the weights (happens only once)
- if(_needs_weights_reshape)
+ if (_needs_weights_reshape)
{
// Run reshape weights kernel and mark weights as unused
- ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
- NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
+ NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(),
+ transpose_pack);
cur_weights->mark_as_unused();
cur_weights = reshaped_weights.get();
}
// Convert weights if needed (happens only once)
- if(_needs_weights_conversion)
+ if (_needs_weights_conversion)
{
- ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+ ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
_convert_weights->run(convert_pack);
cur_weights->mark_as_unused();
@@ -526,7 +569,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors)
gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
        // Prepare GEMM and release unused weights
- if(!_is_quantized_asymmetric)
+ if (!_is_quantized_asymmetric)
{
_mm_gemm->prepare(gemm_pack);
}
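A minimal standalone sketch of the weight-preparation chain that this prepare() hunk reformats: optionally transpose, then optionally convert layout, always advancing a "current weights" pointer and marking the previous stage unused. Plain C++ with placeholder types and hypothetical helper names, not the library's classes.

// Illustrative sketch only; stand-ins for the transpose/convert kernels above.
#include <cstddef>
#include <vector>

struct WeightStage
{
    std::vector<float> data;
    bool               in_use{true};
};

static WeightStage transpose_2d(const WeightStage &w, size_t rows, size_t cols)
{
    WeightStage out;
    out.data.resize(w.data.size());
    for (size_t r = 0; r < rows; ++r)
        for (size_t c = 0; c < cols; ++c)
            out.data[c * rows + r] = w.data[r * cols + c];
    return out;
}

static WeightStage convert_layout(const WeightStage &w)
{
    return w; // layout conversion elided; shape-preserving in this sketch
}

int main()
{
    WeightStage original{{1, 2, 3, 4, 5, 6}};
    const bool  needs_reshape = true, needs_conversion = true;

    // Mirror of the prepare() flow: 'cur' always points at the latest stage,
    // and the previous stage is marked unused so its buffer can be released.
    WeightStage  reshaped, converted;
    WeightStage *cur = &original;
    if (needs_reshape)
    {
        reshaped    = transpose_2d(*cur, 2, 3);
        cur->in_use = false;
        cur         = &reshaped;
    }
    if (needs_conversion)
    {
        converted   = convert_layout(*cur);
        cur->in_use = false;
        cur         = &converted;
    }
    return cur->data.empty() ? 1 : 0; // 'cur' is what feeds the GEMM in the real operator
}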
diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h
index 1e8c6478d0..7073fb9f7c 100644
--- a/src/cpu/operators/CpuFullyConnected.h
+++ b/src/cpu/operators/CpuFullyConnected.h
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H
#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H
-#include "src/cpu/ICpuOperator.h"
-
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+#include "src/cpu/ICpuOperator.h"
+
#include <memory>
namespace arm_compute
@@ -86,16 +86,24 @@ public:
* @param[in] fc_info (Optional) Fully connected layer additional info
     * @param[in]  weights_info (Optional) Stores necessary compute information when weights are already reshaped
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
*
* Similar to @ref CpuFullyConnected::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
    /** Static function that queries whether there exists a fixed-format kernel and, if it exists, it will return in the first argument in what format
* weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same
@@ -103,19 +111,35 @@ public:
*
* @return a status
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
- const ITensorInfo *biases, const ITensorInfo *dst,
- FullyConnectedLayerInfo fc_info, WeightsInfo weights_info);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info,
+ WeightsInfo weights_info);
    // Inherited methods override
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
- void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
+ void configure_fc_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_conv_fc(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act);
enum AuxTensorIdx
{
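The AuxTensorIdx enum above indexes the table returned by workspace(). As a rough sketch of that pattern (hypothetical names and sizes, not the library API): an operator publishes one {slot, lifetime, size} record per auxiliary tensor, and the caller allocates scratch buffers from it before calling prepare()/run().

// Illustrative sketch only.
#include <cstddef>
#include <vector>

enum class Lifetime { Temporary, Persistent, Prepare };

struct MemInfo
{
    int      slot;   // which AuxTensorIdx entry this describes
    Lifetime life;   // how long the buffer must outlive run()/prepare()
    size_t   bytes;  // requested size, 0 means "slot unused"
};

using MemoryRequirements = std::vector<MemInfo>;

enum AuxTensorIdx { TransposedWeights = 0, ConvertedWeights, FlattenedSrc, Count };

static MemoryRequirements workspace_example()
{
    MemoryRequirements req(Count, MemInfo{0, Lifetime::Temporary, 0});
    req[TransposedWeights] = {TransposedWeights, Lifetime::Prepare, 4096};
    req[ConvertedWeights]  = {ConvertedWeights, Lifetime::Persistent, 4096};
    req[FlattenedSrc]      = {FlattenedSrc, Lifetime::Temporary, 16384};
    return req;
}

int main()
{
    size_t total = 0;
    for (const MemInfo &m : workspace_example())
        total += m.bytes; // a memory manager would group these by lifetime
    return total > 0 ? 0 : 1;
}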
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
index 34b845928d..8da166dbef 100644
--- a/src/cpu/operators/CpuGemm.cpp
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -24,9 +24,10 @@
#include "src/cpu/operators/CpuGemm.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -57,17 +58,25 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
}
} // namespace
-void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void CpuGemm::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info);
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool is_c_bias = beta == 1 && c != nullptr;
- bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
- (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
- !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+ const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool is_c_bias = beta == 1 && c != nullptr;
+ bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
// Check if we need to reshape the matrix B only on the first run
_is_prepared = false;
@@ -76,9 +85,12 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_run_alpha_scale = alpha != 1.f;
_run_bias_addition = is_c_bias;
_run_addition = beta != 0 && beta != 1 && c != nullptr;
- _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+ _run_activation =
+ gemm_info.activation_info().enabled() &&
+ (!run_optimised ||
+ (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
- if(run_optimised)
+ if (run_optimised)
{
const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
@@ -90,10 +102,11 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
// Scale product by alpha
- if(_run_alpha_scale)
+ if (_run_alpha_scale)
{
_alpha_scale_func = std::make_unique<cpu::CpuActivation>();
- _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+ _alpha_scale_func->configure(
+ d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
}
}
else
@@ -104,7 +117,7 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
_mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
// Select between GEMV and GEMM
- if(_run_vector_matrix_multiplication)
+ if (_run_vector_matrix_multiplication)
{
// Configure the matrix multiply kernel
_mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
@@ -118,41 +131,50 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso
// Configure interleave kernel
_interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
_interleave_kernel->configure(a, &_tmp_a);
- _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[InterleavedLHS] =
+ MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
// Configure transpose kernel
_transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
_transpose_kernel->configure(b, &_tmp_b);
- _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+ _aux_mem[TransposedRHS] =
+ MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
// Configure matrix multiplication kernel
_mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
}
- if(_run_bias_addition)
+ if (_run_bias_addition)
{
_add_bias = std::make_unique<cpu::CpuAdd>();
_add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
- _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
+ _aux_mem[TempResult] =
+ MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
}
}
// Configure matrix addition kernel
- if(_run_addition)
+ if (_run_addition)
{
_ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
_ma_kernel->configure(c, d, beta);
}
// Configure activation
- if(_run_activation)
+ if (_run_activation)
{
_activation_func = std::make_unique<cpu::CpuActivation>();
_activation_func->configure(d, nullptr, gemm_info.activation_info());
}
}
-Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CpuGemm::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(alpha);
const bool is_c_bias = beta == 1 && c != nullptr;
@@ -162,7 +184,7 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_fixed_format_fast_math(gemm_info.weight_format()))
+ if (is_fixed_format_fast_math(gemm_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -174,46 +196,54 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
const int block_by = arm_compute::block_by(gemm_info.weight_format());
// test if im2col has changed the dimensions that are needed for padding
- if(a->dimension(0) != b->dimension(1) && block_by > 1)
+ if (a->dimension(0) != b->dimension(1) && block_by > 1)
{
// have to verify bias
const size_t dim0_sz = a->dimension(0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz % block_by) != 0,
+ ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
// a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
// b->dimension(1) = kernel_area * input_channel
// a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right
const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by;
const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right;
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (dim0_sz - kernel_area * input_pad_right) != b->dimension(1),
+ "The product AB is defined only if A number of columns and B number of rows are related");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ a->dimension(0) != b->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- if(a->data_type() != DataType::BFLOAT16)
+ if (a->data_type() != DataType::BFLOAT16)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
}
- if(run_addition)
+ if (run_addition)
{
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
+ "The C matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0),
+ "The C matrix must have the same number of columns as the matrix B");
}
- if(d->total_size() != 0)
+ if (d->total_size() != 0)
{
// For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if (gemm_info.depth_output_gemm3d() != 0)
{
- if(gemm_info.reinterpret_input_as_3d())
+ if (gemm_info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
@@ -230,15 +260,19 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
}
// Check if we need to run the optimized assembly kernel
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
- (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
- !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
-
- if(!run_optimised)
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool run_optimised =
+ bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
+ (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+ !(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(),
+ "CpuGemm cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0,
+ "CpuGemm cannot reinterpret the output tensor as 3D");
// Check if the first input tensor is a vector.
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
@@ -254,7 +288,8 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(
+ m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
@@ -263,39 +298,44 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens
TensorInfo tmp_b_info{};
TensorInfo tmp_output_info = *d->clone();
- if(run_interleave_transpose)
+ if (run_interleave_transpose)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(
+ *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(
+ *b, mult_transpose1xW_width)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
}
// Validate matrix multiply
- auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+ auto_init_if_empty(tmp_output_info,
+ matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
- if(is_c_bias)
+ if (is_c_bias)
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE));
}
}
// Validate matrix addition kernel
- if(run_addition)
+ if (run_addition)
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta));
}
// Validate activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
+ if (activation.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation));
}
@@ -312,15 +352,15 @@ void CpuGemm::run(ITensorPack &tensors)
auto c = tensors.get_const_tensor(ACL_SRC_2);
auto d = tensors.get_tensor(ACL_DST);
- if(_asm_glue && _asm_glue->is_configured())
+ if (_asm_glue && _asm_glue->is_configured())
{
// Pass c to asm dispatch only if it's the bias tensor
ITensorPack asm_pack = tensors;
asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr);
_asm_glue->run(asm_pack);
- if(_run_alpha_scale)
+ if (_run_alpha_scale)
{
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
_alpha_scale_func->run(pack);
}
}
@@ -330,18 +370,20 @@ void CpuGemm::run(ITensorPack &tensors)
CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
- ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } };
- if(!_run_vector_matrix_multiplication)
+ ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
+ if (!_run_vector_matrix_multiplication)
{
// Run interleave kernel
- ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } };
- NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack);
+ ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
+ NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
+ interleave_pack);
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
// Run transpose kernel
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+ transpose_pack);
}
// Use reshaped matrices
@@ -349,48 +391,52 @@ void CpuGemm::run(ITensorPack &tensors)
mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
}
- NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack);
+ NEScheduler::get().schedule_op(_mm_kernel.get(),
+ _run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
+ _mm_kernel->window(), mm_pack);
// Run bias addition kernel
- if(_run_bias_addition)
+ if (_run_bias_addition)
{
- ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}};
_add_bias->run(pack);
}
}
// Run matrix addition kernel
- if(_run_addition)
+ if (_run_addition)
{
- ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } };
+ ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}};
NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
}
// Run activation function
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+ ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
_activation_func->run(pack);
}
}
void CpuGemm::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- if(_asm_glue && _asm_glue->is_configured())
+ if (_asm_glue && _asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
}
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+ else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
{
- const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *b_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
- ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+ ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+ transpose_pack);
}
_is_prepared = true;
}
@@ -401,8 +447,12 @@ experimental::MemoryRequirements CpuGemm::workspace() const
return _aux_mem;
}
-Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const GEMMInfo &gemm_info)
+Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info)
{
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
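Before moving on to the header, a compact restatement of the beta/alpha bookkeeping that the CpuGemm::configure() hunks above keep re-wrapping, written as standalone C++ with hypothetical names: beta == 1 lets C be fused as a bias, any other non-zero beta forces a separate matrix-addition pass (the assembly GEMM does not apply beta), and alpha != 1 requires an extra scaling step.

// Illustrative sketch only.
#include <cassert>

struct GemmPlan
{
    bool c_is_bias;       // beta == 1: C is treated as a bias
    bool run_addition;    // 0 < beta != 1: add beta * C with a dedicated kernel
    bool run_alpha_scale; // alpha != 1: scale the product afterwards
};

static GemmPlan plan_gemm(float alpha, float beta, bool has_c)
{
    GemmPlan p{};
    p.c_is_bias       = has_c && beta == 1.f;
    p.run_addition    = has_c && beta != 0.f && beta != 1.f;
    p.run_alpha_scale = alpha != 1.f;
    return p;
}

int main()
{
    // beta == 1 with a C tensor: fuse C as bias, no separate addition pass.
    GemmPlan a = plan_gemm(1.f, 1.f, true);
    assert(a.c_is_bias && !a.run_addition && !a.run_alpha_scale);

    // beta == 0.5: the optimized GEMM cannot apply beta, so beta * C is added
    // afterwards by a matrix-addition kernel; alpha == 2 also needs scaling.
    GemmPlan b = plan_gemm(2.f, 0.5f, true);
    assert(!b.c_is_bias && b.run_addition && b.run_alpha_scale);
    return 0;
}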
diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h
index 9b08e5d0f6..6b30d134fa 100644
--- a/src/cpu/operators/CpuGemm.h
+++ b/src/cpu/operators/CpuGemm.h
@@ -24,12 +24,12 @@
#ifndef ARM_COMPUTE_CPU_GEMM_H
#define ARM_COMPUTE_CPU_GEMM_H
-#include "src/cpu/ICpuOperator.h"
-
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/GEMMInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
@@ -93,16 +93,26 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should happen only for the first run
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
*
* Similar to @ref CpuGemm::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -111,12 +121,16 @@ public:
     * the value of arm_compute::WeightFormat needs to be passed via the
* parameter gemm_info.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const GEMMInfo &gemm_info = GEMMInfo());
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
/** Indicates if the convolution executes in variable weights mode.
@@ -138,28 +152,28 @@ private:
Count
};
- std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr };
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
- std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr };
- std::unique_ptr<CpuActivation> _alpha_scale_func{ nullptr };
- std::unique_ptr<CpuAdd> _add_bias{ nullptr };
- std::unique_ptr<CpuActivation> _activation_func{ nullptr };
+ std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr};
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
+ std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr};
+ std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr};
+ std::unique_ptr<CpuAdd> _add_bias{nullptr};
+ std::unique_ptr<CpuActivation> _activation_func{nullptr};
TensorInfo _tmp_a{};
TensorInfo _tmp_b{};
TensorInfo _tmp_d{};
- bool _run_vector_matrix_multiplication{ false };
- bool _run_alpha_scale{ false };
- bool _run_addition{ false };
- bool _run_bias_addition{ false };
- bool _run_activation{ false };
- bool _reshape_b_only_on_first_run{ false };
- bool _is_prepared{ false };
+ bool _run_vector_matrix_multiplication{false};
+ bool _run_alpha_scale{false};
+ bool _run_addition{false};
+ bool _run_bias_addition{false};
+ bool _run_activation{false};
+ bool _reshape_b_only_on_first_run{false};
+ bool _is_prepared{false};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
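The CpuGemmConv2d.cpp changes that follow repeatedly re-wrap the im2col/col2im skip decision. As a minimal sketch of that decision under simplifying assumptions (a single gemm3d_ok flag stands in for the two validate_gemm3d probes; names are hypothetical): a 1x1, stride-1 NHWC convolution can feed GEMM directly, and col2im can be skipped when the GEMM output may be reinterpreted as 3D.

// Illustrative sketch only.
#include <cassert>

struct SkipInfo
{
    bool skip_im2col;
    bool skip_col2im;
};

static SkipInfo skip_im_col_info(bool is_nhwc, unsigned kw, unsigned kh,
                                 unsigned sx, unsigned sy, bool gemm3d_ok)
{
    const bool skip_im2col = is_nhwc && kw == 1 && kh == 1 && sx == 1 && sy == 1;
    if (skip_im2col && gemm3d_ok)
        return {true, true};  // pointwise NHWC conv feeding a 3D-aware GEMM
    if (!skip_im2col && is_nhwc && gemm3d_ok)
        return {false, true}; // keep im2col but still skip the output reshape
    return {false, false};    // default: run both im2col and col2im
}

int main()
{
    // Pointwise NHWC convolution with a GEMM that supports 3D output.
    SkipInfo a = skip_im_col_info(true, 1, 1, 1, 1, true);
    assert(a.skip_im2col && a.skip_col2im);

    // 3x3 NCHW convolution: both reshapes are required.
    SkipInfo b = skip_im_col_info(false, 3, 3, 1, 1, false);
    assert(!b.skip_im2col && !b.skip_col2im);
    return 0;
}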
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 39b410d609..7c59d88c61 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -26,9 +26,9 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
@@ -52,8 +52,11 @@ namespace arm_compute
{
namespace cpu
{
-CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info)
+CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info)
{
const DataLayout data_layout = src->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -62,63 +65,86 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
const unsigned int kernel_height = weights->dimension(idx_height);
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
- const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- if(skip_im2col)
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
+
+ if (skip_im2col)
{
- const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
- if(skip_col2im)
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
+ if (skip_col2im)
{
- return { true, true };
+ return {true, true};
}
}
else
{
- const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
- if(skip_col2im)
+ const bool skip_col2im =
+ (data_layout == DataLayout::NHWC &&
+ (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
+ if (skip_col2im)
{
- return { false, true };
+ return {false, true};
}
}
// Default case when we cannot reinterpret the input and output as 3D.
- return { false, false };
+ return {false, false};
}
CpuGemmConv2d::CpuGemmConv2d()
- : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
- _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+ : _weights_reshape_kernel(nullptr),
+ _im2col_kernel(),
+ _mm_gemm(),
+ _mm_gemmlowp(),
+ _col2im_kernel(),
+ _reshape(),
+ _im2col_output(),
+ _weights_reshaped(),
+ _gemm_output(),
+ _gemm_output_3d(),
+ _data_layout(DataLayout::NCHW),
+ _skip_im2col(false),
+ _skip_col2im(false),
+ _is_quantized(false),
+ _is_prepared(false),
+ _aux_mem(AuxTensorIdx::Count)
{
}
CpuGemmConv2d::~CpuGemmConv2d() = default;
-void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info,
- bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format)
+void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth,
+ _skip_im2col, fixed_format, weight_format));
// Create GEMMInfo structure
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+ const GEMMInfo &gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weight_format);
// Supported activations in GEMM
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
- if(_is_quantized)
+ if (_is_quantized)
{
- TensorInfo tmp_src{ *src };
- TensorInfo tmp_weights{ *weights };
+ TensorInfo tmp_src{*src};
+ TensorInfo tmp_weights{*weights};
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
const QuantizationInfo iqinfo = src->quantization_info();
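The hunk above touches the point where input and weight offsets are extracted and negated for the quantized path. A standalone sketch of why that offset handling works, using ordinary gemmlowp arithmetic and hypothetical variable names: the raw integer product can be computed first and the zero-point terms corrected afterwards.

// Illustrative sketch only.
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const std::vector<int32_t> a_q{12, 7, 30}; // quantized lhs row
    const std::vector<int32_t> b_q{3, 25, 14}; // quantized rhs column
    const int32_t a_off = 10, b_off = 5;       // zero points
    const int32_t K = 3;

    int32_t dot = 0, sum_a = 0, sum_b = 0, ref = 0;
    for (int i = 0; i < K; ++i)
    {
        dot   += a_q[i] * b_q[i];
        sum_a += a_q[i];
        sum_b += b_q[i];
        ref   += (a_q[i] - a_off) * (b_q[i] - b_off); // offset-free integer product
    }

    // Offset correction applied after the raw accumulation.
    const int32_t corrected = dot - a_off * sum_b - b_off * sum_a + K * a_off * b_off;
    assert(corrected == ref);
    return 0;
}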
@@ -129,7 +155,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
const DataType data_type = src->data_type();
tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(tmp_weights.data_type()))
+ if (!is_data_type_quantized_per_channel(tmp_weights.data_type()))
{
const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
@@ -142,7 +168,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act_info.activation()) != 0)
+ if (supported_acts.count(act_info.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
}
@@ -156,11 +182,12 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format,
- weight_format));
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false,
+ enable_fast_math, false, act_info, fixed_format, weight_format));
auto mm_mem_req = _mm_gemmlowp->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
@@ -171,26 +198,35 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig
_mm_gemm = std::make_unique<CpuGemm>();
_mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
auto mm_mem_req = _mm_gemm->workspace();
- for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
{
_aux_mem[cont] = mm_mem_req[cont];
}
}
}
-Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format)
+Status CpuGemmConv2d::validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ bool fixed_format,
+ arm_compute::WeightFormat weight_format)
{
const DataType data_type = src->data_type();
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool is_activation_enabled = act_info.enabled();
// Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weight_format);
- if(is_quantized)
+ if (is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -206,11 +242,10 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
}
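A rough sketch of how the clamp bounds used just above are derived in the quantized domain for the RELU-family activations, assuming the usual asymmetric quantization formula q = round(x / scale) + offset; the helper names are hypothetical, not the library's.

// Illustrative sketch only.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <utility>

static int32_t quantize_u8(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return std::min<int32_t>(255, std::max<int32_t>(0, q));
}

enum class Act { RELU, BOUNDED_RELU, LU_BOUNDED_RELU };

// Returns {min, max} clamp values in the quantized (uint8) domain.
static std::pair<int32_t, int32_t> quantized_act_bounds(Act act, float a, float b,
                                                        float scale, int32_t offset)
{
    switch (act)
    {
        case Act::RELU:            return {quantize_u8(0.f, scale, offset), 255};
        case Act::BOUNDED_RELU:    return {quantize_u8(0.f, scale, offset), quantize_u8(a, scale, offset)};
        case Act::LU_BOUNDED_RELU: return {quantize_u8(b, scale, offset), quantize_u8(a, scale, offset)};
    }
    return {0, 255};
}

int main()
{
    // Output quantized with scale 0.1 and zero point 10: RELU clamps at q(0) = 10.
    auto r = quantized_act_bounds(Act::RELU, 0.f, 0.f, 0.1f, 10);
    assert(r.first == 10 && r.second == 255);
    return 0;
}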
@@ -229,8 +264,9 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math,
- false, act_info));
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst,
+ GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false,
+ output_info, false, enable_fast_math, false, act_info));
}
else
{
@@ -239,36 +275,44 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei
}
}
-Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info,
+ const ITensorInfo *weights_info,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col)
{
const DataType data_type = input_info->data_type();
const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
// Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
+ const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type,
+ input_info->quantization_info());
const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+ const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type,
+ input_info->quantization_info());
- return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col);
+ return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false,
+ gemm_3d_depth, skip_im2col);
}
-void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuGemmConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_UNUSED(num_groups, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src,
- weights,
- biases,
- dst,
- conv_info,
- weights_info,
- dilation,
- act_info,
- enable_fast_math,
- num_groups));
- ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation,
+ act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math,
+ num_groups);
const DataType data_type = src->data_type();
const DataLayout data_layout = src->data_layout();
@@ -283,7 +327,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
_is_prepared = weights_info.retain_internal_weights();
_is_quantized = is_data_type_quantized_asymmetric(src->data_type());
_data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv_info.stride().first == 1 && conv_info.stride().second == 1);
const ITensorInfo *gemm_input_to_use = src;
ITensorInfo *gemm_output_to_use = dst;
@@ -291,20 +336,17 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// Get convolved dimensions
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
"Output shape does not match the expected one");
// Check if GEMM3D is supported
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
- _skip_im2col = skip_info.skip_im2col;
- _skip_col2im = skip_info.skip_col2im;
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ _skip_im2col = skip_info.skip_im2col;
+ _skip_col2im = skip_info.skip_col2im;
// Get parameters from conv_info
unsigned int stride_x = 0;
@@ -320,17 +362,19 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
_weights_reshaped.set_quantization_info(weights->quantization_info());
// Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
const int block_by = arm_compute::block_by(weights_info.weight_format());
unsigned int input_pad_right = 0;
- if(block_by > 1)
+ if (block_by > 1)
{
- input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
}
// Configure
_im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
- _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right);
+ _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation,
+ num_groups, input_pad_right);
// Update GEMM input
gemm_input_to_use = &_im2col_output;
@@ -338,7 +382,7 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// Create temporary GEMM output tensor in case we cannot skip col2im
const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
TensorShape shape_gemm;
@@ -368,9 +412,10 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
// In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format());
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math,
+ gemm_3d_depth, fixed_format, weights_info.weight_format());
- if(!_skip_col2im && _data_layout == DataLayout::NCHW)
+ if (!_skip_col2im && _data_layout == DataLayout::NCHW)
{
// Configure col2im
_col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
@@ -390,14 +435,24 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights
gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
// Check lifetime
- _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
- _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size());
- _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+ _aux_mem[Im2ColOutput] =
+ MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped),
+ gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent,
+ _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
}
-Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math)
+Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ const bool enable_fast_math)
{
const DataLayout data_layout = src->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -406,36 +461,44 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo
const unsigned int kernel_height = weights->dimension(idx_height);
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
- dilation, act_info);
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
const bool skip_im2col = skip_info.skip_im2col;
const bool skip_col2im = skip_info.skip_col2im;
const unsigned int gemm_3d_depth = skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+ false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
-Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuGemmConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
- if(!is_fixed_format(weights_info.weight_format()))
+ if (!is_fixed_format(weights_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
}
@@ -468,29 +531,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
unsigned int conv_w = 0;
unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
- src->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv_info, dilation);
// Check if GEMM3D is supported
- const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
- dilation, act_info);
- const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
+ const CpuGemmConv2d::SkipInfo skip_info =
+ CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+ const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_quantized)
+ if (is_quantized)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
- else if(is_bf16)
+ else if (is_bf16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
}
@@ -503,20 +562,23 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
}
unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+ unsigned int mat_weights_rows =
+ weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type());
weights_reshaped_info.set_quantization_info(weights->quantization_info());
weights_to_use = &weights_reshaped_info;
- if(!skip_im2col)
+ if (!skip_im2col)
{
const int block_by = arm_compute::block_by(weights_info.weight_format());
int input_pad_right = 0;
- if(block_by > 1)
+ if (block_by > 1)
{
- input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
- mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right);
+ input_pad_right =
+ (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+ mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) *
+ (weights->dimension(idx_channel) + input_pad_right);
}
// Create tensor info for im2col reshaped inputs
@@ -528,13 +590,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
im2col_reshaped_info.set_quantization_info(src->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height),
+ conv_info, append_bias, dilation, num_groups, input_pad_right));
gemm_input_to_use = &im2col_reshaped_info;
}
// Create temporary GEMM output tensor in case we cannot skip col2im
const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!skip_col2im)
+ if (!skip_col2im)
{
TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
@@ -549,13 +613,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight
gemm_output_to_use = &info_gemm;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info,
+ enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
weights_info.weight_format()));
// Validate Col2Im/ReshapeLayer
- if(!skip_col2im && (data_layout == DataLayout::NCHW))
+ if (!skip_col2im && (data_layout == DataLayout::NCHW))
{
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
}
return Status{};
@@ -574,15 +640,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
- if(!_skip_im2col)
+ if (!_skip_im2col)
{
// Run input reshaping
unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, src },
- { TensorType::ACL_DST, im2col_output.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
gemm_input_to_use = im2col_output.get();
}
@@ -595,11 +657,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
gemm3d.allocator()->import_memory(out_to_use->buffer());
auto gemm_output_to_use = gemm_output.get();
- if(_skip_im2col)
+ if (_skip_im2col)
{
gemm_output_to_use = &gemm3d;
}
- if(_skip_col2im && !out_has_padding)
+ if (_skip_col2im && !out_has_padding)
{
gemm_output_to_use = dst;
}
@@ -607,12 +669,12 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
// Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
ITensorPack pack_mm = tensors;
pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
- if(!this->isVarWeightsKernel())
+ if (!this->isVarWeightsKernel())
{
pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
}
pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
- if(_is_quantized)
+ if (_is_quantized)
{
// Run gemmlowp
_mm_gemmlowp->run(pack_mm);
@@ -624,45 +686,33 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
}
// Reshape output matrix
- if(!_skip_col2im)
+ if (!_skip_col2im)
{
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output.get() },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}};
NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
}
else
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
_reshape->run(pack);
}
}
- else if(out_has_padding)
+ else if (out_has_padding)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, gemm_output_to_use },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
_reshape->run(pack);
}
}
void CpuGemmConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Variable weights executions that use fixed-format kernels
// need no reshaping of the weights.
- if(this->isVarWeightsKernel())
+ if (this->isVarWeightsKernel())
{
_is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
_is_prepared = true;
@@ -672,11 +722,7 @@ void CpuGemmConv2d::prepare(ITensorPack &tensors)
// Run weights reshaping and mark original weights tensor as unused
CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, weights },
- { TensorType::ACL_DST, weights_reshaped.get() }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
weights->mark_as_unused();
ITensorPack gemm_pack = tensors;
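A side note for readers tracking the ITensorPack changes in run() and prepare() above: the old multi-line aggregate initialisers are collapsed into single brace-initialised packs that are handed to the scheduler together with the kernel's window. The sketch below shows that pattern in isolation; run_packed_kernel and its parameter names are invented, and header paths beyond NEScheduler.h/ITensor.h (which this diff itself includes) are assumptions that may need adjusting.

    // Sketch only: schedule one already-configured kernel over a brace-initialised pack,
    // mirroring the run()/prepare() bodies reformatted above.
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/ITensorPack.h" // assumed header for ITensorPack/TensorType
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    template <typename KernelT>
    void run_packed_kernel(KernelT *kernel, arm_compute::ITensor *src, arm_compute::ITensor *dst, unsigned int split_dim)
    {
        using namespace arm_compute;
        // One brace-initialised pack replaces the old multi-line aggregate initialiser.
        ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, dst}};
        NEScheduler::get().schedule_op(kernel, split_dim, kernel->window(), pack);
    }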
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
index 61fe63a79f..118d366517 100644
--- a/src/cpu/operators/CpuGemmConv2d.h
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -27,6 +27,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
#include <memory>
@@ -106,17 +107,32 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmConvolution::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -124,10 +140,16 @@ public:
*
* @return a status.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- const bool enable_fast_math = false);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const bool enable_fast_math = false);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
@@ -150,8 +172,15 @@ private:
* @param[in] fixed_format (Optional) Select GEMM execution with variable weights.
* @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
*/
- void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+ void configure_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -170,8 +199,16 @@ private:
*
* @return a status
*/
- static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+ static Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ int gemm_3d_depth = 1,
+ bool skip_im2col = false,
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
/** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmLowpMatrixMultiplyCore
*
* @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -182,7 +219,11 @@ private:
*
* @return a status
*/
- static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+ static Status validate_gemm3d(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ActivationLayerInfo &act_info,
+ int gemm_3d_depth,
+ bool skip_im2col);
struct SkipInfo
{
@@ -200,8 +241,11 @@ private:
*
* @return a SkipInfo instance.
*/
- static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info);
+ static SkipInfo skip_im_col_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info);
/** Indicates if the convolution executes in variable weights mode.
*
@@ -236,7 +280,7 @@ private:
bool _is_quantized;
bool _is_prepared;
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
} // namespace cpu
} // namespace arm_compute
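Since the header hunks above only re-wrap the public signatures, a call-site sketch may help confirm that the parameter order and defaults are unchanged. Everything below the includes is hypothetical (shapes, data type, padding); only the validate() signature is taken from the declaration above, and this header lives in the library's src tree rather than the installed API.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    #include "src/cpu/operators/CpuGemmConv2d.h"

    // Hypothetical NCHW case: 32x32x16 input, eight 3x3x16 kernels, stride 1, pad 1.
    bool gemm_conv2d_config_is_valid()
    {
        using namespace arm_compute;
        const TensorInfo    src(TensorShape(32U, 32U, 16U), 1, DataType::F32);
        const TensorInfo    weights(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32);
        const TensorInfo    biases(TensorShape(8U), 1, DataType::F32);
        const TensorInfo    dst(TensorShape(32U, 32U, 8U), 1, DataType::F32);
        const PadStrideInfo conv_info(1, 1, 1, 1);
        // weights_info, dilation, act_info, enable_fast_math and num_groups keep the
        // defaults visible in the declaration above.
        const Status status = cpu::CpuGemmConv2d::validate(&src, &weights, &biases, &dst, conv_info);
        return bool(status);
    }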
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
index 5ce285cb6f..8fa81b1907 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -26,10 +26,10 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
-
#include "support/Cast.h"
#include <set>
@@ -43,7 +43,10 @@ using namespace arm_compute::utils::cast;
namespace
{
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
@@ -53,16 +56,15 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src,
const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
const DataType data_type = src->data_type();
// Merge activation with output stage
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- PixelValue type_min{};
- PixelValue type_max{};
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+ PixelValue type_min{};
+ PixelValue type_max{};
std::tie(type_min, type_max) = get_min_max(data_type);
int32_t min_activation = type_min.get<int32_t>();
int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act.activation()) != 0)
+ if (supported_acts.count(act.activation()) != 0)
{
std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
}
@@ -107,31 +109,32 @@ CpuGemmDirectConv2d::CpuGemmDirectConv2d()
CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
-void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src,
- weights,
- biases != nullptr ? biases : nullptr,
- dst,
- info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
_run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
_is_prepared = false;
- _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 });
+ _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2});
// Configure assembly dispatch
cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- if(is_data_type_quantized(src->data_type()))
+ if (is_data_type_quantized(src->data_type()))
{
asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
}
_gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
// Configure activation
- if(_run_activation)
+ if (_run_activation)
{
_activation_func->configure(dst, nullptr, info.act_info);
}
@@ -141,24 +144,33 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w
_aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
_aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
- if(_aux_mem[Pretranspose].size > 0)
+ if (_aux_mem[Pretranspose].size > 0)
{
// Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
}
else
{
// We must permute weights if they are WeightFormat::UNSPECIFIED
- if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
+ if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
}
}
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(!is_fixed_format(info.weights_info.weight_format()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ if (!is_fixed_format(info.weights_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
}
@@ -171,13 +183,13 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
- if(is_data_type_quantized_asymmetric(data_type))
+ if (is_data_type_quantized_asymmetric(data_type))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
- else if(data_type == DataType::BFLOAT16)
+ else if (data_type == DataType::BFLOAT16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
}
@@ -198,31 +210,32 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors)
prepare(tensors);
_gemm_asm_func->run(tensors);
- if(_run_activation)
+ if (_run_activation)
{
ITensor *io = tensors.get_tensor(ACL_DST);
- ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } };
+ ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}};
_activation_func->run(pack);
}
}
void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// If we are using a fixed-format kernel the weights are already reshaped
- if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
+ if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
{
_gemm_asm_func->prepare(tensors);
_is_prepared = true;
return;
}
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
_weights_permute_func->run(permute_tensors);
tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get());
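The calculate_output_stage_metadata() change above is pure re-wrapping, but the logic it formats is easy to restate: the requantization clamp range defaults to the full range of the output data type and is narrowed only when the fused activation is one of the supported ReLU variants. The snippet below is a simplified, standalone restatement of that decision, not the library code; the enum and function names are invented.

    #include <cstdint>
    #include <set>
    #include <utility>

    enum class Act { NONE, RELU, BOUNDED_RELU, LU_BOUNDED_RELU, TANH };

    // Pick the clamp bounds for the quantized output stage: activation bounds win only
    // for the ReLU family, otherwise the raw type range is kept.
    std::pair<int32_t, int32_t> clamp_range(Act act, int32_t type_min, int32_t type_max,
                                            int32_t act_min, int32_t act_max)
    {
        static const std::set<Act> supported = {Act::RELU, Act::BOUNDED_RELU, Act::LU_BOUNDED_RELU};
        return supported.count(act) != 0 ? std::make_pair(act_min, act_max)
                                         : std::make_pair(type_min, type_max);
    }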
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
index e55a461f36..1cc3caadae 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.h
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/operators/CpuActivation.h"
@@ -69,18 +70,26 @@ public:
* Data types supported: Same as @p input.
* @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
*
* Similar to CpuGemmDirectConv2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 8ca128fb07..2ee879b67b 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -28,14 +28,14 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
@@ -59,12 +59,12 @@ namespace
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.output_stage = info.gemmlowp_output_stage();
- asm_info.fast_mode = info.fast_math();
+ asm_info.method = cpu::AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.output_stage = info.gemmlowp_output_stage();
+ asm_info.fast_mode = info.fast_math();
return asm_info;
}
@@ -105,7 +105,8 @@ CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
}
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
-void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
+void CpuGemmLowpMatrixMultiplyCore::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
@@ -122,28 +123,31 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_reshape_b_only_on_first_run = b->are_values_constant();
_is_prepared = false;
_fused_assembly_path = false;
- _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
- _gemm_info = gemm_info;
+ _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
+ _reshape_b_only_on_first_run;
+ _gemm_info = gemm_info;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
const ITensorInfo *a_to_use = a;
// Convert to QASYMM8 -> QASYMM8_SIGNED and back
- if(_flip_signedness)
+ if (_flip_signedness)
{
const int32_t offset_correction = 128;
const DataType dt = DataType::QASYMM8_SIGNED;
const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
- _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
_convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
_convert_to_signed_asymm->configure(a_to_use, &_signed_a);
a_to_use = &_signed_a;
_a_offset = _signed_a.quantization_info().uniform().offset;
const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
- _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+ _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
// Output stage correction
GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -157,7 +161,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
_fuse_output_stage = true;
_mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
@@ -166,16 +170,18 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
// Initialize assembly kernel meta-data
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
#ifdef __aarch64__
- if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
{
- switch(a->data_type())
+ switch (a->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
case DataType::U8:
case DataType::S8:
{
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
auto c_info_to_use = c == nullptr ? nullptr : c;
_asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
@@ -197,13 +203,14 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
}
#endif /* __aarch64__ */
- if(!(_assembly_path || _run_vector_matrix_multiplication))
+ if (!(_assembly_path || _run_vector_matrix_multiplication))
{
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
+ _tmp_a =
+ TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
_tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
@@ -216,13 +223,13 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_mtx_b_reshape_kernel->configure(b, &_tmp_b);
}
- if(!_fused_assembly_path)
+ if (!_fused_assembly_path)
{
// Build reduction info
const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
+ if (_a_offset != 0)
{
_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
@@ -232,7 +239,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
_vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
@@ -241,24 +248,23 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
_mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
}
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
// Configure matrix multiply kernel
- if(!_assembly_path)
+ if (!_assembly_path)
{
_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
_mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
}
- _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
- _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : dst,
- a->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel =
+ std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
+ _offset_contribution_output_stage_kernel->configure(
+ &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output : dst,
+ a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());
- if(_flip_signedness)
+ if (_flip_signedness)
{
_convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
_convert_from_signed_asymm->configure(&_signed_output, dst);
@@ -267,27 +273,29 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
else
{
// Configure matrix multiply kernel
- if(!_assembly_path)
+ if (!_assembly_path)
{
_mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
_mm_kernel->configure(matrix_a, matrix_b, dst);
}
// Configure offset contribution kernel
_offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
- _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
+ _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
_a_offset, _b_offset);
}
}
// Configure activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
- if(_run_activation)
+ _run_activation =
+ activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+ if (_run_activation)
{
_activation_func = std::make_unique<CpuActivation>();
_activation_func->configure(dst, nullptr, activation);
}
- if(_assembly_path)
+ if (_assembly_path)
{
auto asm_mem_req = _asm_glue->workspace();
_aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
@@ -295,27 +303,41 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso
}
// Request memory for LHS and RHS reshape matrix
- _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0
- && _reshape_b_only_on_first_run ?
- MemoryLifetime::Persistent :
- MemoryLifetime::Temporary,
- _vector_sum_col.total_size());
- _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
- _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
- _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
- _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
- _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
- _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+ _aux_mem[VectorSumCol] =
+ MemoryInfo(offset_int_vec(VectorSumCol),
+ !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent
+ : MemoryLifetime::Temporary,
+ _vector_sum_col.total_size());
+ _aux_mem[VectorSumRow] =
+ MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _tmp_b.total_size());
+ _aux_mem[MMResultS32] =
+ MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+ _aux_mem[SignedOutput] =
+ MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
}
-Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr &&
+ gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+ "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
@@ -333,28 +355,32 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
int32_t b_offset = b->quantization_info().uniform().offset;
bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if(fuse_output_stage)
+ if (fuse_output_stage)
{
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ auto_init_if_empty(mm_result_s32_info,
+ a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
}
// Convert QASYMM8->QASYMM8_SIGNED
TensorInfo signed_a{};
TensorInfo signed_output{};
- bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
- if(flip_signedness)
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
+ (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if (flip_signedness)
{
const int32_t offset_correction = 128;
const DataType dt = DataType::QASYMM8_SIGNED;
const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
- signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
a_to_use = &signed_a;
a_offset = signed_a.quantization_info().uniform().offset;
const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
- signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(
+ QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
// Output stage correction
GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -374,25 +400,28 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
bool run_optimised = false;
bool run_optimised_requantized = false;
- if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+ if (!(!b->are_values_constant() &&
+ b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
{
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+ info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+ run_optimised = bool(CpuGemmAssemblyDispatch::validate(
+ a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
}
}
- if(run_optimised)
+ if (run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(info.depth_output_gemm3d() != 0)
+ if (info.depth_output_gemm3d() != 0)
{
- if(info.reinterpret_input_as_3d())
+ if (info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -409,11 +438,13 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+ "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+ "NEGEMM cannot reinterpret the output tensor as 3D");
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if(!run_vector_matrix_multiplication)
+ if (!run_vector_matrix_multiplication)
{
matrix_a_info = &tmp_a_info;
matrix_b_info = &tmp_b_info;
@@ -437,7 +468,7 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
}
}
- if(!run_optimised_requantized)
+ if (!run_optimised_requantized)
{
TensorInfo info_vector_sum_col{};
TensorInfo info_vector_sum_row{};
@@ -445,62 +476,70 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens
const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
// Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
+ if (a_offset != 0)
{
info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
// Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
}
// Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
+ if (b_offset != 0)
{
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
}
- if(fuse_output_stage)
+ if (fuse_output_stage)
{
- if(!run_optimised)
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info));
}
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- flip_signedness ? &signed_output : output,
- a_offset, b_offset,
- info.gemmlowp_output_stage()));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset,
+ b_offset, info.gemmlowp_output_stage()));
}
else
{
- if(!run_optimised)
+ if (!run_optimised)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.reinterpret_input_as_3d(),
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ info.depth_output_gemm3d() != 0,
+ "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
}
// Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(
+ output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
}
}
// Validate activation
const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
+ if (activation.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
}
@@ -529,24 +568,22 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
// Convert QASYMM8->QASYMM8_SIGNED
- if(_flip_signedness)
+ if (_flip_signedness)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a },
- { TensorType::ACL_DST, signed_a.get() }
- };
- NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};
+ NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),
+ pack);
a_to_use = signed_a.get();
matrix_a = signed_a.get();
}
// Run GEMM
- if(_asm_glue->is_configured())
+ if (_asm_glue->is_configured())
{
ITensorPack asm_glue_tensors = tensors;
auto output_to_use = (_fuse_output_stage ? mm_result_s32.get() : dst);
- if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&
+ _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
@@ -563,35 +600,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
}
else
{
- if(!_run_vector_matrix_multiplication)
+ if (!_run_vector_matrix_multiplication)
{
matrix_a = tmp_a.get();
matrix_b = tmp_b.get();
// Run interleave kernel
- ITensorPack pack_a =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, tmp_a.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);
+ ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),
+ pack_a);
- if(!_reshape_b_only_on_first_run)
+ if (!_reshape_b_only_on_first_run)
{
- ITensorPack pack_b =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
+ ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};
// Run transpose kernel
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
+ _mtx_b_reshape_kernel->window(), pack_b);
}
}
- ITensorPack pack_mm =
- {
- { TensorType::ACL_SRC_0, matrix_a },
- { TensorType::ACL_SRC_1, matrix_b }
- };
- if(_fuse_output_stage)
+ ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};
+ if (_fuse_output_stage)
{
pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
}
@@ -602,31 +629,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
}
- if(!_fused_assembly_path)
+ if (!_fused_assembly_path)
{
// Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
+ if (_b_offset != 0)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, a_to_use },
- { TensorType::ACL_DST, vector_sum_row.get() }
- };
- NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};
+ NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
+ _mtx_a_reduction_kernel->window(), pack);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
}
- if(_fuse_output_stage)
+ if (_fuse_output_stage)
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
@@ -636,7 +657,8 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
// Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
+ NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
+ _offset_contribution_output_stage_kernel->window(), pack);
}
else
{
@@ -646,68 +668,57 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
pack.add_tensor(TensorType::ACL_DST, dst);
// Run offset contribution kernel
- NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
+ NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
+ _offset_contribution_kernel->window(), pack);
}
}
// Convert QASYMM8_SIGNED->QASYMM8
- if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
+ if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, signed_output.get() },
- { TensorType::ACL_DST, dst }
- };
- NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};
+ NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
+ _convert_from_signed_asymm->window(), pack);
}
// Run fused activation unless already run in the fused assembly
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, dst },
- { TensorType::ACL_DST, dst }
- };
+ ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
_activation_func->run(pack);
}
}
void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
// Run assembly reshape
- if(_asm_glue->is_configured())
+ if (_asm_glue->is_configured())
{
_asm_glue->prepare(tensors);
}
// Run non-assembly reshape
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
{
// Run reshape kernel and mark original weights tensor as unused
- ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
+ ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, tmp_b.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),
+ pack);
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+ if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
{
- ITensor *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
+ ITensor *vector_sum_col_p =
+ utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, original_b },
- { TensorType::ACL_DST, vector_sum_col.get() }
- };
- NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+ ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+ NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+ _mtx_b_reduction_kernel->window(), pack);
}
_is_prepared = true;
}
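One detail worth spelling out from the signedness-flip code in configure() and validate() above: re-labelling QASYMM8 data as QASYMM8_SIGNED works without rescaling because shifting the stored integer and the zero point by the same correction (128 here) leaves the dequantized value untouched. A tiny self-contained check of that identity, with made-up scale and offset values:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const float   scale  = 0.05f; // hypothetical quantization parameters
        const int32_t offset = 30;
        const int32_t q      = 200; // a stored 8-bit sample
        const int32_t c      = 128; // the offset_correction used by the operator

        const float before = scale * float(q - offset);
        const float after  = scale * float((q - c) - (offset - c)); // shift value and zero point together

        assert(before == after); // dequantized value is preserved
        return 0;
    }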
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
index a1b34291d0..a7798938e7 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/GEMMInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -108,18 +109,26 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuGemmLowpMatrixMultiplyCore::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *dst,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
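As with the convolution header earlier, the signature re-wrap here changes nothing semantically; a hedged call-site sketch (all shapes and quantization values invented, default GEMMInfo) shows how the re-wrapped validate() is meant to be called. Whether this particular configuration is accepted still depends on the target and build options.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

    // Hypothetical 2x4 * 4x3 low-precision GEMM with an S32 result and no bias.
    bool gemmlowp_config_is_valid()
    {
        using namespace arm_compute;
        const TensorInfo a(TensorShape(4U, 2U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
        const TensorInfo b(TensorShape(3U, 4U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5));
        const TensorInfo dst(TensorShape(3U, 2U), 1, DataType::S32);
        // c stays nullptr here: with the default output stage (NONE) the error message
        // above rejects bias addition for an S32 output.
        const Status status = cpu::CpuGemmLowpMatrixMultiplyCore::validate(&a, &b, nullptr, &dst);
        return bool(status);
    }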
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
index 58f98acff0..4215eed199 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
@@ -36,36 +37,42 @@ namespace arm_compute
{
namespace cpu
{
-void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+void CpuGemmLowpOutputStage::configure(ITensorInfo *src,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
{
- switch(info.output_data_type)
+ switch (info.output_data_type)
{
case DataType::QASYMM8:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
case DataType::QASYMM8_SIGNED:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+ info.gemmlowp_min_bound, info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
case DataType::QSYMM16:
{
auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound,
+ info.gemmlowp_max_bound);
_kernel = std::move(k);
break;
}
@@ -79,7 +86,7 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen
}
case GEMMLowpOutputStageType::QUANTIZE_DOWN:
{
- switch(info.output_data_type)
+ switch (info.output_data_type)
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -102,32 +109,41 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen
}
}
-Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN,
+ "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) &&
+ (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
- switch(info.type)
+ switch (info.type)
{
case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
{
- switch(dst->data_type())
+ switch (dst->data_type())
{
case DataType::QASYMM8:
- return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
case DataType::QASYMM8_SIGNED:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
case DataType::QSYMM16:
- return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+ return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+ src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
default:
return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
}
}
case GEMMLowpOutputStageType::QUANTIZE_DOWN:
{
- switch(dst->data_type())
+ switch (dst->data_type())
{
case DataType::QASYMM8:
case DataType::QASYMM8_SIGNED:
@@ -146,4 +162,4 @@ void CpuGemmLowpOutputStage::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
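
For context on the fixed-point path selected above, a hedged sketch of how a GEMMLowpOutputStageInfo might be filled: the combined float rescale (src_scale * weights_scale / dst_scale) is converted by calculate_quantized_multiplier() into the integer multiplier/shift pair the kernels consume. The values and the QASYMM8 bounds are illustrative, and the header that actually declares GEMMLowpOutputStageInfo may differ between library versions:

#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

using namespace arm_compute;

GEMMLowpOutputStageInfo make_output_stage(float src_scale, float weights_scale, float dst_scale, int32_t dst_offset)
{
    GEMMLowpOutputStageInfo info{};
    info.type               = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.output_data_type   = DataType::QASYMM8;
    info.gemmlowp_offset    = dst_offset;
    info.gemmlowp_min_bound = 0;   // full QASYMM8 range, no fused activation
    info.gemmlowp_max_bound = 255;

    int32_t multiplier = 0;
    int32_t shift      = 0;
    // Returns a Status on out-of-range scales; ignored here for brevity.
    quantization::calculate_quantized_multiplier(src_scale * weights_scale / dst_scale, &multiplier, &shift);
    info.gemmlowp_multiplier = multiplier;
    info.gemmlowp_shift      = shift;
    return info;
}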
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h
index 39394f6b5f..e5e2f41fa9 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.h
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
#include "arm_compute/core/Types.h"
+
#include "src/cpu/ICpuOperator.h"
/** This file contains all available output stages for GEMMLowp.
@@ -76,7 +77,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 8811a7ea6b..89087129c3 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -23,14 +23,16 @@
*/
#include "src/cpu/operators/CpuMatMul.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
+
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/function_info/MatMulInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
@@ -46,8 +48,11 @@ namespace cpu
{
namespace
{
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
{
const auto data_type = src->data_type();
const QuantizationInfo oq_info = dst->quantization_info();
@@ -59,10 +64,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
int32_t output_multiplier;
int32_t output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
- int32_t type_min = 0;
- int32_t type_max = 0;
+ int32_t type_min = 0;
+ int32_t type_max = 0;
std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -77,14 +83,27 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo
} // namespace
CpuMatMul::CpuMatMul()
- : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape()
+ : _transpose_kernel_lhs(),
+ _transpose_kernel_rhs(),
+ _asm_glue(),
+ _lhs_transposed(),
+ _rhs_transposed(),
+ _original_lhs_shape(),
+ _original_rhs_shape(),
+ _original_dst_shape()
{
}
-Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+Status CpuMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
@@ -103,34 +122,39 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
gemm_info.fast_mode = settings.fast_math();
// Validate and then permute a/b
- if(adj_lhs)
+ if (adj_lhs)
{
- auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
+ auto_init_if_empty(lhs_transposed,
+ lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
// Assign lhs_to_use pointer to use transposed TensorInfo
lhs_to_use = &lhs_transposed;
}
- if(adj_rhs)
+ if (adj_rhs)
{
- auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
+ auto_init_if_empty(rhs_transposed,
+ rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
// Assign rhs_to_use pointer to use transposed TensorInfo
rhs_to_use = &rhs_transposed;
}
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)");
+ "The product AB is defined only if the number of columns in A is equal to the "
+ "number of rows in B (after transpose)");
// Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
- for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
+ for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
+ "Broadcasting in Batch dimension is unsupported by this operator.");
}
// Quantized-specific configuration
- if(is_data_type_quantized(lhs->data_type()))
+ if (is_data_type_quantized(lhs->data_type()))
{
- ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage));
+ ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
+ gemm_info.activation_info, gemm_info.output_stage));
}
cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info);
@@ -138,7 +162,12 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const
return Status{};
}
-void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CpuMatMul::configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
@@ -163,21 +192,23 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
_original_rhs_shape = rhs_to_use.tensor_shape();
// Reshape lhs for use with assembly kernels.
- lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
- dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
+ lhs_to_use.set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
+ dst_to_use.set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2));
// 2. Configuration for transpose of lhs/rhs
// ------------------------------------------------------
// Initialise transposed TensorInfo class for aux tensors (intermediary tensors)
- if(_adj_lhs)
+ if (_adj_lhs)
{
// Setup transpose LHS
_transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
_transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed);
}
- if(_adj_rhs)
+ if (_adj_rhs)
{
// Setup transpose RHS
_transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
@@ -196,20 +227,22 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst,
rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;
// Quantized-specific configuration
- if(is_data_type_quantized(lhs->data_type()))
+ if (is_data_type_quantized(lhs->data_type()))
{
- get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage);
+ get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info,
+ _gemm_info.output_stage);
}
// Configure Asm Kernel
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
- _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul
+ _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
+ _gemm_info); // c is nullptr as bias not supported in MatMul
// Specify memory requirements for intermediate tensors
auto asm_mem_req = _asm_glue->workspace();
// Specify memory required by gemm kernel
int idx = 0;
- for(const auto &aux : asm_mem_req)
+ for (const auto &aux : asm_mem_req)
{
_aux_mem[idx] = aux;
idx++;
@@ -228,8 +261,12 @@ void CpuMatMul::run(ITensorPack &tensors)
// Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm)
// Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
- lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
- dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ lhs->info()->set_tensor_shape(
+ TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
+ _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+ dst->info()->set_tensor_shape(
+ TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
+ _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));
// Initialise object to handle stored transposed tensors in auxillary memory
@@ -240,17 +277,19 @@ void CpuMatMul::run(ITensorPack &tensors)
ITensorPack asm_tensors(tensors);
// Run transpose lhs if necessary
- if(_adj_lhs)
+ if (_adj_lhs)
{
- ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack);
+ ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
+ lhs_transpose_pack);
asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
}
// Run transpose rhs if necessary
- if(_adj_rhs)
+ if (_adj_rhs)
{
- ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } };
- NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack);
+ ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
+ NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
+ rhs_transpose_pack);
asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get());
}
// Run asm kernel
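
The reshapes above fold every dimension from index 2 upwards into a single batch dimension before handing the tensors to the assembly kernel. A small sketch of that collapse, using only the TensorShape calls that appear in the hunk (the example shape is illustrative, not from the patch):

#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

TensorShape collapse_for_asm(const TensorShape &lhs)
{
    // e.g. (24, 16, 2, 3) -> (24, 16, 1, 6): the 2x3 batch dimensions are
    // folded into z, mirroring what configure()/run() do to _original_lhs_shape.
    return TensorShape(lhs.x(), lhs.y(), 1, lhs.collapsed_from(2).z());
}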
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 475c019fd0..24db3da346 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -25,6 +25,7 @@
#define ACL_SRC_CPU_OPERATORS_CPUMATMUL
#include "arm_compute/core/TensorInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/kernels/CpuTransposeKernel.h"
@@ -66,18 +67,27 @@ public:
* @param[in] settings The settings for matmul operation (i.e fast math)
* @param[in] act_info Class containing information about fused activation function.
*/
- void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuMatMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -91,9 +101,9 @@ private:
};
// Define unique pointers to kernels/operators used by matmul
- std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{ nullptr };
- std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{ nullptr };
- std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr };
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr};
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
// TensorInfo for tensors stored in auxillary memory
TensorInfo _lhs_transposed{};
@@ -105,13 +115,13 @@ private:
TensorShape _original_dst_shape{};
// Note : adj_lhs means the same as transposing lhs
- bool _adj_lhs{ false };
- bool _adj_rhs{ false };
- bool _fast_math{ false };
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
+ bool _fast_math{false};
AsmGemmInfo _gemm_info{};
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
};
-}
-}
+} // namespace cpu
+} // namespace arm_compute
#endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */
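
A validate-then-configure sketch for the interface above, not part of the patch: the shapes are illustrative, the default-constructed MatMulInfo/CpuMatMulSettings are assumed to be acceptable here, and set_are_values_constant(false) is assumed to be the way to satisfy the dynamic-operand check in validate():

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/MatMulInfo.h"
#include "arm_compute/runtime/NEON/functions/NEMatMul.h" // CpuMatMulSettings
#include "src/cpu/operators/CpuMatMul.h"

using namespace arm_compute;

void matmul_example()
{
    TensorInfo lhs(TensorShape(24U, 16U, 6U), 1, DataType::F32); // K=24, M=16, 6 batches
    TensorInfo rhs(TensorShape(8U, 24U, 6U), 1, DataType::F32);  // N=8,  K=24, 6 batches
    TensorInfo dst(TensorShape(8U, 16U, 6U), 1, DataType::F32);

    // MatMul requires dynamic (non-constant) operands.
    lhs.set_are_values_constant(false);
    rhs.set_are_values_constant(false);

    cpu::CpuMatMul mm;
    if (bool(cpu::CpuMatMul::validate(&lhs, &rhs, &dst, MatMulInfo(), CpuMatMulSettings())))
    {
        mm.configure(&lhs, &rhs, &dst, MatMulInfo(), CpuMatMulSettings());
    }
}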
diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp
index 24e9fd6d46..697fc40ab3 100644
--- a/src/cpu/operators/CpuMaxUnpooling.cpp
+++ b/src/cpu/operators/CpuMaxUnpooling.cpp
@@ -22,6 +22,7 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuMaxUnpooling.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
@@ -29,7 +30,10 @@ namespace arm_compute
{
namespace cpu
{
-void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+void CpuMaxUnpooling::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info);
auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>();
@@ -37,9 +41,12 @@ void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indic
_kernel = std::move(k);
}
-Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status CpuMaxUnpooling::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
{
return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h
index aa1f1072a5..5dc00bce9e 100644
--- a/src/cpu/operators/CpuMaxUnpooling.h
+++ b/src/cpu/operators/CpuMaxUnpooling.h
@@ -44,14 +44,18 @@ public:
* @param[out] dst Destination tensor. Data types supported: Same as @p src
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
*/
- void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ void
+ configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuMaxUnpooling::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info);
};
} // namespace cpu
} // namespace arm_compute
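
A hedged usage sketch, not from the patch: CpuMaxUnpooling scatters the pooled values back into the larger tensor using the U32 indices recorded by a matching max pool. The shapes, the assumed PoolingLayerInfo constructor arguments and the NCHW layout are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuMaxUnpooling.h"

using namespace arm_compute;

void max_unpool_example()
{
    TensorInfo pooled(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    TensorInfo indices(TensorShape(16U, 16U, 8U), 1, DataType::U32);
    TensorInfo unpooled(TensorShape(32U, 32U, 8U), 1, DataType::F32);

    // Same pooling description as the preceding 2x2 stride-2 max pool.
    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

    cpu::CpuMaxUnpooling unpool;
    if (bool(cpu::CpuMaxUnpooling::validate(&pooled, &indices, &unpooled, pool_info)))
    {
        unpool.configure(&pooled, &indices, &unpooled, pool_info);
    }
}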
diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp
index 4c15015206..ac9847111d 100644
--- a/src/cpu/operators/CpuMul.cpp
+++ b/src/cpu/operators/CpuMul.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuMulKernel.h"
@@ -33,14 +34,24 @@ namespace arm_compute
{
namespace cpu
{
-Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status CpuMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
}
-void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void CpuMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
@@ -58,13 +69,19 @@ void CpuMul::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
}
-Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status CpuComplexMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
}
-void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void CpuComplexMul::configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
@@ -80,4 +97,4 @@ void CpuComplexMul::run(ITensorPack &tensors)
NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
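
A usage sketch for the reformatted CpuMul interface, not from the patch: dst = src1 * src2 * scale with the given overflow and rounding policies, and the fused activation left at its default because validate() above rejects an enabled one. Shapes and policy choices are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuMul.h"

using namespace arm_compute;

void mul_example()
{
    TensorInfo src1(TensorShape(64U, 32U), 1, DataType::F32);
    TensorInfo src2(TensorShape(64U, 32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U, 32U), 1, DataType::F32);

    cpu::CpuMul mul;
    if (bool(cpu::CpuMul::validate(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)))
    {
        mul.configure(&src1, &src2, &dst, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
    }
}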
diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h
index 3e0edbf050..82b309830b 100644
--- a/src/cpu/operators/CpuMul.h
+++ b/src/cpu/operators/CpuMul.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -61,7 +62,12 @@ public:
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
@@ -69,7 +75,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
@@ -89,14 +100,20 @@ public:
* @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuComplexMul::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp
index babaf21b6f..25acc92d00 100644
--- a/src/cpu/operators/CpuPermute.cpp
+++ b/src/cpu/operators/CpuPermute.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuPermute.h"
-#include "src/cpu/kernels/CpuPermuteKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPermuteKernel.h"
namespace arm_compute
{
@@ -43,5 +42,5 @@ Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, cons
{
return kernels::CpuPermuteKernel::validate(src, dst, perm);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36ee5..b72bde6978 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuPool2dKernel.h"
#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
@@ -53,7 +54,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices);
// Check if we can run assembly kernels. Currently, indices are not supported by those kernels
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
// Get data layout
_data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
@@ -61,10 +63,11 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
// Check if we have Global Pooling Layer
const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height);
- _use_kernel_indices = pool_info.use_kernel_indices;
+ _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) &&
+ (src->dimension(idx_height) == pool_info.pool_size.height);
+ _use_kernel_indices = pool_info.use_kernel_indices;
- if(run_optimised)
+ if (run_optimised)
{
const CPUInfo &ci = NEScheduler::get().cpu_info();
const unsigned int num_threads = NEScheduler::get().num_threads();
@@ -76,7 +79,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
// Get kernel's memory requirements
constexpr size_t alignment = 4096;
const size_t workspace_size = pooling_wrapper->get_working_size(num_threads);
- _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
+ _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
_asm_glue = std::move(pooling_wrapper);
}
@@ -89,11 +92,15 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
}
}
-Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2d::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
- const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+ const bool run_optimised =
+ bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
- if(run_optimised)
+ if (run_optimised)
{
return Status{};
}
@@ -105,20 +112,24 @@ void CpuPool2d::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
- if(_asm_glue)
+ if (_asm_glue)
{
const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
}
else
{
- switch(_data_layout)
+ switch (_data_layout)
{
case DataLayout::NCHW:
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ _is_global_pooling_layer ? Window::DimZ : Window::DimY,
+ _pooling_layer_kernel->window(), tensors);
break;
case DataLayout::NHWC:
- NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors);
+ NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+ (_use_kernel_indices ? Window::DimY : Window::DimX),
+ _pooling_layer_kernel->window(), tensors);
break;
default:
ARM_COMPUTE_ERROR("Data layout not supported");
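
A configure sketch for CpuPool2d, not from the patch: a 2x2 stride-2 max pool over an NHWC tensor with no indices requested, which is the case that lets the assembly wrapper checked above be used when available. The shapes and the assumed PoolingLayerInfo constructor arguments are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuPool2d.h"

using namespace arm_compute;

void pool_example()
{
    TensorInfo src(TensorShape(16U, 32U, 32U), 1, DataType::F32); // C=16, W=32, H=32 in NHWC
    src.set_data_layout(DataLayout::NHWC);
    TensorInfo dst(TensorShape(16U, 16U, 16U), 1, DataType::F32);
    dst.set_data_layout(DataLayout::NHWC);

    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

    cpu::CpuPool2d pool;
    if (bool(cpu::CpuPool2d::validate(&src, &dst, pool_info))) // indices omitted -> assembly path possible
    {
        pool.configure(&src, &dst, pool_info);
    }
}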
diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h
index 5c571db88a..ea73e3f335 100644
--- a/src/cpu/operators/CpuPool2d.h
+++ b/src/cpu/operators/CpuPool2d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL2D_H
#include "arm_compute/core/experimental/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -58,17 +59,21 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+ void
+ configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration
*
* Similar to CpuPool2d::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp
index 14e4ac6c97..7fa78c1f80 100644
--- a/src/cpu/operators/CpuPool3d.cpp
+++ b/src/cpu/operators/CpuPool3d.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Scheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuPool3dKernel.h"
@@ -35,8 +36,7 @@ namespace arm_compute
{
namespace cpu
{
-CpuPool3d::CpuPool3d()
- : _aux_mem(1)
+CpuPool3d::CpuPool3d() : _aux_mem(1)
{
}
@@ -70,4 +70,4 @@ experimental::MemoryRequirements CpuPool3d::workspace() const
}
} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
index 8a73f8a0af..235d798095 100644
--- a/src/cpu/operators/CpuPool3d.h
+++ b/src/cpu/operators/CpuPool3d.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_POOL3D_H
#include "arm_compute/core/experimental/Types.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -61,7 +62,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
index f9e14d1f88..4315499c39 100644
--- a/src/cpu/operators/CpuQuantize.cpp
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuQuantizeKernel.h"
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index e6892a2e7e..a423abb49a 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -23,11 +23,10 @@
*/
#include "src/cpu/operators/CpuReshape.h"
-#include "src/cpu/kernels/CpuReshapeKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuReshapeKernel.h"
namespace arm_compute
{
@@ -49,7 +48,7 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
void CpuReshape::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- if(!_is_prepared)
+ if (!_is_prepared)
{
static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
_is_prepared = true;
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 9bc43e7db4..33da792319 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CPU_RESHAPE_H
#define ARM_COMPUTE_CPU_RESHAPE_H
-#include "src/cpu/ICpuOperator.h"
#include "arm_compute/core/Window.h"
+#include "src/cpu/ICpuOperator.h"
+
namespace arm_compute
{
namespace cpu
@@ -53,7 +54,7 @@ public:
void run(ITensorPack &tensors) override;
private:
- bool _is_prepared{ false } ;
+ bool _is_prepared{false};
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp
index 8a712bf088..7df9296931 100644
--- a/src/cpu/operators/CpuScale.cpp
+++ b/src/cpu/operators/CpuScale.cpp
@@ -24,8 +24,9 @@
#include "src/cpu/operators/CpuScale.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/utils/ScaleUtils.h"
#include "src/cpu/kernels/CpuScaleKernel.h"
@@ -37,11 +38,12 @@ namespace cpu
{
namespace
{
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
+void precompute_dx_dy_offsets(
+ ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
{
ARM_COMPUTE_ERROR_ON(offsets == nullptr);
float sampling_offset = 0.0f;
- if(sampling_policy == SamplingPolicy::CENTER)
+ if (sampling_policy == SamplingPolicy::CENTER)
{
sampling_offset = 0.5f;
}
@@ -50,38 +52,44 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float
win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
- if(dx != nullptr && dy != nullptr)
+ if (dx != nullptr && dy != nullptr)
{
// Pre-compute the offset and pixel's distance for BILINEAR interpolation
Iterator offsets_it(offsets, win);
Iterator dx_it(dx, win);
Iterator dy_it(dy, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
- const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
- const int in_xi = std::floor(in_x);
- const int in_yi = std::floor(in_y);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float in_x = (id.x() + sampling_offset) * wr - sampling_offset;
+ const float in_y = (id.y() + sampling_offset) * hr - sampling_offset;
+ const int in_xi = std::floor(in_x);
+ const int in_yi = std::floor(in_y);
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
- *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
- },
- offsets_it, dx_it, dy_it);
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi;
+ *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi;
+ },
+ offsets_it, dx_it, dy_it);
}
else
{
// Pre-compute the offset for NEAREST interpolation
Iterator offsets_it(offsets, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const float float_in_xi = (id.x() + sampling_offset) * wr;
- const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
- *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
- },
- offsets_it);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const float float_in_xi = (id.x() + sampling_offset) * wr;
+ const auto in_xi = static_cast<size_t>(
+ align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi)
+ : std::floor(float_in_xi));
+ *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+ },
+ offsets_it);
}
}
} // namespace
@@ -96,20 +104,24 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
_is_prepared = false;
// Get data layout and width/height indices
- _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
// Get the tensor shape
TensorShape shape(dst->dimension(idx_width));
@@ -122,7 +134,7 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy);
auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets);
auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -148,7 +160,8 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn
Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+ info.sampling_policy != SamplingPolicy::TOP_LEFT);
ITensorInfo *offsets = nullptr;
ITensorInfo *dx = nullptr;
@@ -160,19 +173,25 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+ dst->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+ dst->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : info.interpolation_policy;
// Get the tensor shape of auxilary buffers
const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
TensorInfo tensor_info_offsets(shape, Format::S32);
TensorInfo tensor_info_dx(shape, Format::F32);
TensorInfo tensor_info_dy(shape, Format::F32);
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
offsets = &tensor_info_offsets;
@@ -186,13 +205,14 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const
break;
}
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
return Status{};
}
void CpuScale::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_is_prepared = true;
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
@@ -206,22 +226,27 @@ void CpuScale::prepare(ITensorPack &tensors)
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ _scale_info.align_corners &&
+ arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+ src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
- && hr <= 1.f) ?
- InterpolationPolicy::NEAREST_NEIGHBOR :
- _scale_info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : _scale_info.interpolation_policy;
const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
- bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
+ bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+ _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
- if(precompute_indices_weights)
+ if (precompute_indices_weights)
{
- switch(policy_to_use)
+ switch (policy_to_use)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
{
@@ -245,7 +270,8 @@ void CpuScale::prepare(ITensorPack &tensors)
}
else
{
- if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
+ if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR &&
+ policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
{
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
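
A standalone recap (plain C++, no ACL dependencies) of the bilinear precomputation above: with CENTER sampling the source coordinate for output x is (x + 0.5) * ratio - 0.5, which is split into an integer offset and a fractional part used as the interpolation weight. Names are illustrative:

#include <cmath>
#include <cstdint>
#include <vector>

void precompute_row(float wr, int out_width, std::vector<int32_t> &offsets, std::vector<float> &dx)
{
    const float sampling_offset = 0.5f; // SamplingPolicy::CENTER
    offsets.resize(out_width);
    dx.resize(out_width);
    for (int x = 0; x < out_width; ++x)
    {
        const float in_x  = (x + sampling_offset) * wr - sampling_offset;
        const int   in_xi = static_cast<int>(std::floor(in_x));
        offsets[x] = in_xi;        // nearest lower source column
        dx[x]      = in_x - in_xi; // fractional distance for bilinear weights
    }
}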
diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h
index ee7c523bad..c12a8e733a 100644
--- a/src/cpu/operators/CpuScale.h
+++ b/src/cpu/operators/CpuScale.h
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CPU_SCALE_H
#define ARM_COMPUTE_CPU_SCALE_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/experimental/Types.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
@@ -62,9 +63,9 @@ public:
void run(ITensorPack &tensors) override;
private:
- ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- bool _is_prepared{ false };
+ ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED};
+ DataLayout _data_layout{DataLayout::UNKNOWN};
+ bool _is_prepared{false};
};
} // namespace cpu
} // namespace arm_compute
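
A usage sketch for CpuScale, not from the patch: a bilinear upscale with replicated borders, constructed with the same two-argument ScaleKernelInfo initialisation used for the default member above. Shapes are illustrative:

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/cpu/operators/CpuScale.h"

using namespace arm_compute;

void scale_example()
{
    TensorInfo src(TensorShape(32U, 32U, 3U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U, 64U, 3U), 1, DataType::F32);

    const ScaleKernelInfo info{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE};

    cpu::CpuScale scale;
    if (bool(cpu::CpuScale::validate(&src, &dst, info)))
    {
        scale.configure(&src, &dst, info);
    }
}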
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
index bf4c2fa3a2..e55d7f903e 100644
--- a/src/cpu/operators/CpuSoftmax.cpp
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -25,9 +25,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
@@ -63,13 +64,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis);
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
_needs_permute = actual_axis > 0;
- if(_needs_permute)
+ if (_needs_permute)
{
- _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ _permute_input.configure(src, &_input_permuted,
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
}
// We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
@@ -79,10 +82,11 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
// Create intermediate tensors shapes
TensorShape max_sum_shape = tmp_input->tensor_shape();
max_sum_shape.set(0, 1);
- const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
- DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
- TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
- TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+ const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+ DataType tmp_data_type =
+ is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
+ TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+ TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
// Init intermediate tensors
_max = TensorInfo(max_info);
@@ -94,13 +98,14 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
_max_kernel = std::move(mk);
auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
- if(_needs_permute)
+ if (_needs_permute)
{
// The normalization kernel stores the result in a permuted output tensor
sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
// Re-permute the permuted output into the requested (4D) output
- _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+ _permute_output.configure(&_output_permuted, dst,
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
}
else
{
@@ -109,11 +114,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d
}
_softmax_kernel = std::move(sm);
- _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
- _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+ _aux_mem[InternalTensorIdx::MAX] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size());
- _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
+ MemoryLifetime::Temporary, _input_permuted.total_size());
+ _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST),
+ MemoryLifetime::Temporary, _output_permuted.total_size());
}
template <bool IS_LOG>
@@ -123,7 +132,8 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) ||
+ static_cast<int32_t>(src->num_dimensions()) <= axis);
// Create intermediate tensor info
DataType tmp_data_type = src->data_type();
@@ -131,25 +141,33 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor
TensorShape max_sum_shape = src->tensor_shape();
max_sum_shape.set(0, 1);
- const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
+ const TensorInfo tensor_info_max_sum(src->clone()
+ ->set_tensor_shape(max_sum_shape)
+ .set_data_type(tmp_data_type)
+ .set_quantization_info(src->quantization_info())
+ .set_is_resizable(true));
const TensorInfo dont_care;
- const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+ const unsigned int actual_axis =
+ static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
const bool needs_permute = actual_axis > 0;
- if(needs_permute)
+ if (needs_permute)
{
- const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
- const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
- TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
+ const PermutationVector permutation_vector =
+ softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+ const TensorShape permuted_shape =
+ misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
+ TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
}
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
- ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
+ &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
return Status{};
}
@@ -166,43 +184,38 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
- CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true);
+ CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors,
+ true);
ITensorPack max_pack;
ITensorPack softmax_pack;
- if(_needs_permute)
+ if (_needs_permute)
{
- ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } };
+ ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}};
_permute_input.run(permute_in_pack);
- max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } };
+ max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}};
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, input_permuted.get() },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, output_permuted.get() },
- { TensorType::ACL_DST_1, tmp.get() }
- };
+ softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()},
+ {TensorType::ACL_SRC_1, max.get()},
+ {TensorType::ACL_DST_0, output_permuted.get()},
+ {TensorType::ACL_DST_1, tmp.get()}};
}
else
{
- max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } };
-
- softmax_pack =
- {
- { TensorType::ACL_SRC_0, src },
- { TensorType::ACL_SRC_1, max.get() },
- { TensorType::ACL_DST_0, dst },
- { TensorType::ACL_DST_1, tmp.get() }
- };
+ max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}};
+
+ softmax_pack = {{TensorType::ACL_SRC_0, src},
+ {TensorType::ACL_SRC_1, max.get()},
+ {TensorType::ACL_DST_0, dst},
+ {TensorType::ACL_DST_1, tmp.get()}};
}
NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
- if(_needs_permute)
+ if (_needs_permute)
{
ITensorPack permute_out_pack;
permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get());
@@ -211,7 +224,7 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors)
}
}
-template <bool IS_LOG>
+template <bool IS_LOG>
experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
{
return _aux_mem;
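The softmax hunks above revolve around mapping a signed axis to an actual dimension index and permuting whenever that dimension is not the innermost one. The following standalone sketch (plain C++, not ACL code; wrap_around here is a simplified stand-in for the arm_compute helper of the same name) illustrates that decision.

// Standalone sketch: how a signed softmax axis is mapped to an actual
// dimension index and why any axis other than 0 triggers a permute.
#include <cassert>
#include <cstdint>
#include <iostream>

static int32_t wrap_around(int32_t axis, int32_t num_dims)
{
    // Map negative axes (e.g. -1 == last dimension) into [0, num_dims).
    return axis < 0 ? axis + num_dims : axis;
}

int main()
{
    const int32_t num_dims = 4;  // this operator supports at most 4 dimensions
    const int32_t axis     = -1; // user-facing axis, may be negative

    assert(axis >= -num_dims && axis < num_dims);

    const uint32_t actual_axis   = static_cast<uint32_t>(wrap_around(axis, num_dims));
    const bool     needs_permute = actual_axis > 0;

    // The kernels only reduce along dimension 0, so any other axis is first
    // rotated to the front, softmax is run, and the result is permuted back.
    std::cout << "actual_axis=" << actual_axis
              << " needs_permute=" << std::boolalpha << needs_permute << '\n';
    return 0;
}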
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 64df8704f9..8cab70e14f 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -24,11 +24,13 @@
#ifndef ARM_COMPUTE_CPU_SOFTMAX_H
#define ARM_COMPUTE_CPU_SOFTMAX_H
-#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/TensorInfo.h"
+
#include "src/cpu/ICpuKernel.h"
#include "src/cpu/ICpuOperator.h"
#include "src/cpu/operators/CpuPermute.h"
+
#include <memory>
namespace arm_compute
@@ -77,7 +79,7 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp
index 91a5b6e63c..7d27efbc96 100644
--- a/src/cpu/operators/CpuSub.cpp
+++ b/src/cpu/operators/CpuSub.cpp
@@ -23,17 +23,20 @@
*/
#include "src/cpu/operators/CpuSub.h"
-#include "src/cpu/kernels/CpuSubKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuSubKernel.h"
namespace arm_compute
{
namespace cpu
{
-void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuSub::configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy);
@@ -42,7 +45,11 @@ void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor
_kernel = std::move(k);
}
-Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuSub::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h
index 88908637aa..d1782a1d3c 100644
--- a/src/cpu/operators/CpuSub.h
+++ b/src/cpu/operators/CpuSub.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_SUB_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/cpu/ICpuOperator.h"
namespace arm_compute
@@ -53,14 +54,22 @@ public:
* @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration
*
* Similar to @ref CpuSub::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(ITensorPack &tensors) override;
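The CpuSub changes are purely signature reformatting, but the one-parameter-per-line layout makes the validate-then-configure contract easy to read. Below is a hedged usage sketch, assuming the listed internal headers are reachable on the include path and that Status exposes error_code(); it is illustrative only and not part of the public API.

// Hedged sketch: validate() is static and side-effect free, so call it
// before configure(); CpuSub rejects a fused activation, so none is passed.
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/operators/CpuSub.h"

using namespace arm_compute;

bool try_configure_sub(cpu::CpuSub &sub, const TensorInfo &src0, const TensorInfo &src1, TensorInfo &dst)
{
    const Status st = cpu::CpuSub::validate(&src0, &src1, &dst, ConvertPolicy::SATURATE);
    if (st.error_code() != ErrorCode::OK) // assumption: Status exposes error_code()
    {
        return false;
    }
    // configure() mirrors validate() and instantiates the CpuSubKernel.
    sub.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);
    return true;
}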
diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp
index 4e7854fd6e..ea548e0511 100644
--- a/src/cpu/operators/CpuTranspose.cpp
+++ b/src/cpu/operators/CpuTranspose.cpp
@@ -23,9 +23,8 @@
*/
#include "src/cpu/operators/CpuTranspose.h"
-#include "src/cpu/kernels/CpuTransposeKernel.h"
-
#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
namespace arm_compute
{
@@ -43,5 +42,5 @@ Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
return kernels::CpuTransposeKernel::validate(src, dst);
}
-} // namesapce cpu
+} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
index c4edd89964..9d07736c13 100644
--- a/src/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -22,23 +22,25 @@
* SOFTWARE.
*/
#include "src/cpu/operators/CpuWinogradConv2d.h"
+
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/kernels/assembly/winograd.hpp"
#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuPermute.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
@@ -56,21 +58,26 @@ namespace
inline Tensor4DShape internal_get_shape(const ITensorInfo *in)
{
const DataLayout data_layout = in->data_layout();
- const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
- const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
- const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
+ const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+ const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+ const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+ const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
- return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
+ return Tensor4DShape{in_batches, in_height, in_width, in_channels};
}
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(dst, weights);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
- if(biases != nullptr)
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+ "Winograd layer only supports unit strides.");
+ if (biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
@@ -80,43 +87,46 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
return Status{};
}
-bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math,
- arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
+bool get_winograd_kernel_implementation(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ arm_conv::winograd::WinogradImpl *winograd_impl,
+ std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
{
arm_conv::winograd::WinogradConfig winograd_cfg;
arm_gemm::GemmConfig cfg;
const DataType data_type = src->data_type();
- Tensor4DShape in_shape{ internal_get_shape(src) };
- Tensor4DShape out_shape{ internal_get_shape(dst) };
- Tensor4DShape kernel_shape{ internal_get_shape(weights) };
+ Tensor4DShape in_shape{internal_get_shape(src)};
+ Tensor4DShape out_shape{internal_get_shape(dst)};
+ Tensor4DShape kernel_shape{internal_get_shape(weights)};
uint32_t nthreads = NEScheduler::get().num_threads();
// Get configuration arguments for Winograd
winograd_cfg.output_rows = 0;
winograd_cfg.output_cols = 0;
conv_args = std::make_unique<arm_conv::ConvolutionArgs>(
- in_shape.n_batches,
- arm_conv::Shape2D{ static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols) },
- in_shape.n_channels,
- conv_info.pad_top(),
- conv_info.pad_left(),
- arm_conv::Shape2D{ static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols) },
- out_shape.n_channels,
- arm_conv::Shape2D{ static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols) },
- assembly_utils::map_to_arm_gemm_activation(act_info));
+ in_shape.n_batches,
+ arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)},
+ in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(),
+ arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)},
+ out_shape.n_channels,
+ arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)},
+ assembly_utils::map_to_arm_gemm_activation(act_info));
bool success = false;
- if(data_type == DataType::F32)
+ if (data_type == DataType::F32)
{
- success = arm_conv::winograd::get_implementation<float>(
- *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+ success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
}
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- else if(data_type == DataType::F16)
+ else if (data_type == DataType::F16)
{
- success = arm_conv::winograd::get_implementation<__fp16>(
- *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+ success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+ enable_fast_math, &winograd_cfg, nullptr);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
else
@@ -127,7 +137,8 @@ bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInf
}
inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
{
- return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+ return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+ act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
}
} // namespace
@@ -141,7 +152,7 @@ CpuWinogradConv2d::CpuWinogradConv2d()
_permute_output(std::make_unique<CpuPermute>()),
_permute_weights(std::make_unique<CpuPermute>()),
_aux_mem(AuxTensorIdx::Count),
- _conv_args{ nullptr },
+ _conv_args{nullptr},
_winograd_impl{},
_data_layout(),
_winograd_transformed_input{},
@@ -152,15 +163,20 @@ CpuWinogradConv2d::CpuWinogradConv2d()
_weights_hwio(),
_input_nhwc(),
_output_nhwc(),
- _is_prepared{ false },
- _run_activation{ false }
+ _is_prepared{false},
+ _run_activation{false}
{
}
CpuWinogradConv2d::~CpuWinogradConv2d() = default;
-void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CpuWinogradConv2d::configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
@@ -169,21 +185,29 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
const DataType data_type = src->data_type();
uint32_t nthreads = NEScheduler::get().num_threads();
_data_layout = src->data_layout();
- const Tensor4DShape kernel_shape{ internal_get_shape(weights) };
-
- bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args);
-
- ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
-
- const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
- if(has_impl)
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
+
+ bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &_winograd_impl, _conv_args);
+
+ ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ _winograd_impl.input_transform->get_name().c_str());
+
+ const bool has_impl = ((_winograd_impl.input_transform != nullptr) &&
+ (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
+ if (has_impl)
{
// Determine how much working space is required, allocate it.
- const size_t input_workspace_size = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
- const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
+ const size_t input_workspace_size =
+ _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
+ const size_t output_workspace_size =
+ _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
@@ -232,7 +256,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
// Configure the kernel to transform the input tensor from NCHW -> NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
_permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
@@ -242,28 +266,30 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
// Reorder the convoluted output to ACL's ordering NCHW
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
// configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
- dst->dimension(1), dst->dimension(3)),
- 1, dst->data_type());
+ TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1,
+ dst->data_type());
_output_nhwc = info;
_permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
}
// Configure input transform kernel
- _transform_input_kernel = std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
+ _transform_input_kernel =
+ std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
// Configure GEMM function
- _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f);
+ _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr,
+ &_winograd_transformed_output, 1.0f, 0.f);
// Configure output transform kernel
- _transform_output_kernel = std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
+ _transform_output_kernel =
+ std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
//Configure Activation Layer
_run_activation = act_info.enabled() && !fuse_function_supported(act_info);
- if(_run_activation)
+ if (_run_activation)
{
_activation_func->configure(dst, nullptr, act_info);
}
@@ -276,40 +302,55 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei
_aux_mem[TempResult] = asm_mem_req[TempResult];
// Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment);
- _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment);
- _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size));
- _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
- _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment);
- if(_data_layout == DataLayout::NCHW)
+ _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary,
+ wds.input_matrix_size_bytes, storage_alignment);
+ _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary,
+ wds.output_matrix_size_bytes, storage_alignment);
+ _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary,
+ std::max(input_workspace_size, output_workspace_size));
+ _aux_mem[PermutedWeights] =
+ MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
+ _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent,
+ wds.weight_matrix_size_bytes, storage_alignment);
+ if (_data_layout == DataLayout::NCHW)
{
_aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
_aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
}
}
}
-Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CpuWinogradConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
// Disable winograd for fp16 if fast math is false.
- if(!enable_fast_math)
+ if (!enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
}
- const Tensor4DShape kernel_shape{ internal_get_shape(weights) };
+ const Tensor4DShape kernel_shape{internal_get_shape(weights)};
arm_conv::winograd::WinogradImpl winograd_impl{};
std::unique_ptr<arm_conv::ConvolutionArgs> conv_args;
- const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str());
- ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str());
+ const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+ &winograd_impl, conv_args);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+ kernel_shape.n_cols);
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+ winograd_impl.input_transform->get_name().c_str());
return Status{};
}
@@ -328,24 +369,29 @@ void CpuWinogradConv2d::run(ITensorPack &tensors)
// Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory.
CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
- CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true);
+ CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input,
+ tensors, true);
CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
const bool is_nchw = _data_layout == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } };
+ ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}};
_permute_input->run(pack);
}
- CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true);
+ CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output,
+ tensors, true);
CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
- ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } };
+ ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? input_nhwc.get() : src},
+ {ACL_DST, winograd_input_transformed.get()},
+ {ACL_INT, input_workspace.get()}};
NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack);
- CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true);
+ CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights,
+ tensors, true);
// Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
ITensorPack gemm_pack = tensors;
@@ -356,30 +402,34 @@ void CpuWinogradConv2d::run(ITensorPack &tensors)
_gemm_function->run(gemm_pack);
// Output transform
- ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } };
+ ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()},
+ {ACL_DST, is_nchw ? output_nhwc.get() : output},
+ {ACL_SRC_1, biases},
+ {ACL_INT, output_workspace.get()}};
NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack);
- if(is_nchw)
+ if (is_nchw)
{
// Reorder the convoluted output to ACL's ordering NCHW
- ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } };
+ ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}};
_permute_output->run(pack);
}
- if(_run_activation)
+ if (_run_activation)
{
- ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } };
+ ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}};
_activation_func->run(pack);
}
}
void CpuWinogradConv2d::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
- const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+ const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *weights_aux =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
- ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+ ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
_permute_weights->run(permute_tensors);
const int element_size_in_bytes = permuted_weights.get()->info()->element_size();
// Weights were in OHWI format, before being permuted "permuted_weights" to be in HWIO format.
@@ -387,31 +437,32 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors)
const unsigned int width_idx = 2; // W in HWIO
const unsigned int channel_idx = 1; // I in HWIO
- const int permuted_weight_row_stride = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
- const int permuted_weight_col_stride = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
- const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
+ const int permuted_weight_row_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
+ const int permuted_weight_col_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
+ const int permuted_weight_channel_stride =
+ permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
// Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory.
- ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
+ ITensor *weights_transf =
+ utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf);
const void *permuted_weights_ptr;
void *win_wght_transf_ptr;
- permuted_weights_ptr = reinterpret_cast<const void *>(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
- win_wght_transf_ptr = reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
+ permuted_weights_ptr = reinterpret_cast<const void *>(
+ permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
+ win_wght_transf_ptr =
+ reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() +
+ winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
// Prepare Weights
_winograd_impl.weight_transform->execute(
- *_conv_args,
- permuted_weights_ptr,
- permuted_weight_row_stride,
- permuted_weight_col_stride,
- permuted_weight_channel_stride,
- win_wght_transf_ptr,
- _winograd_impl.winograd_spec,
- 0, 1 // Thread 1 of 1
+ *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride,
+ permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1
);
ITensorPack gemm_pack = tensors;
gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get());
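In the prepare() hunks above, byte strides are converted into element strides before the permuted HWIO weights are handed to the assembly weight transform. The sketch below (plain C++ with hypothetical shapes, not ACL types) mirrors that conversion for a densely packed HWIO tensor, where dimension 0 is O (fastest), then I, W, H.

// Standalone sketch: deriving the element strides the Winograd weight
// transform expects from byte strides, as prepare() does with
// strides_in_bytes() / element_size().
#include <array>
#include <cstddef>
#include <iostream>

int main()
{
    // Densely packed HWIO weights: H=3, W=3, I=64, O=128, 4-byte floats.
    const std::array<std::size_t, 4> shape = {128, 64, 3, 3}; // {O, I, W, H}, fastest first
    const std::size_t element_size = 4;

    // Byte strides for a packed layout: each dimension's stride is the
    // product of all faster-moving extents times the element size.
    std::array<std::size_t, 4> strides_in_bytes{};
    strides_in_bytes[0] = element_size;
    for (std::size_t i = 1; i < shape.size(); ++i)
    {
        strides_in_bytes[i] = strides_in_bytes[i - 1] * shape[i - 1];
    }

    // Indices as in the diff: channel (I) is dimension 1, W is 2, H is 3.
    const std::size_t channel_stride = strides_in_bytes[1] / element_size;
    const std::size_t col_stride     = strides_in_bytes[2] / element_size;
    const std::size_t row_stride     = strides_in_bytes[3] / element_size;

    std::cout << "row=" << row_stride << " col=" << col_stride
              << " channel=" << channel_stride << '\n';
    return 0;
}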
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
index e0df34e2db..7e1d952462 100644
--- a/src/cpu/operators/CpuWinogradConv2d.h
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -26,10 +26,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/kernels/assembly/gemm_common.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuGemm.h"
#include "src/cpu/operators/CpuPermute.h"
@@ -73,7 +74,11 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ void configure(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo(),
bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d
@@ -82,13 +87,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo(),
bool enable_fast_math = false);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
@@ -108,27 +117,28 @@ private:
PermutedOutput = TransformedInput,
Count = 10
};
- std::unique_ptr<CpuGemm> _gemm_function;
- std::unique_ptr<CpuActivation> _activation_func;
- std::unique_ptr<ICPPKernel> _transform_input_kernel;
- std::unique_ptr<ICPPKernel> _transform_output_kernel;
- std::unique_ptr<CpuPermute> _permute_input;
- std::unique_ptr<CpuPermute> _permute_output;
- std::unique_ptr<CpuPermute> _permute_weights;
- experimental::MemoryRequirements _aux_mem{ Count };
- std::unique_ptr<arm_conv::ConvolutionArgs> _conv_args; // Make it unique ptr because this type does not have a default constructor
- arm_conv::winograd::WinogradImpl _winograd_impl;
- DataLayout _data_layout;
- TensorInfo _winograd_transformed_input;
- TensorInfo _winograd_transformed_output;
- TensorInfo _winograd_transformed_weights;
- TensorInfo _input_workspace;
- TensorInfo _output_workspace;
- TensorInfo _weights_hwio;
- TensorInfo _input_nhwc;
- TensorInfo _output_nhwc;
- bool _is_prepared;
- bool _run_activation;
+ std::unique_ptr<CpuGemm> _gemm_function;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<ICPPKernel> _transform_input_kernel;
+ std::unique_ptr<ICPPKernel> _transform_output_kernel;
+ std::unique_ptr<CpuPermute> _permute_input;
+ std::unique_ptr<CpuPermute> _permute_output;
+ std::unique_ptr<CpuPermute> _permute_weights;
+ experimental::MemoryRequirements _aux_mem{Count};
+ std::unique_ptr<arm_conv::ConvolutionArgs>
+ _conv_args; // Make it unique ptr because this type does not have a default constructor
+ arm_conv::winograd::WinogradImpl _winograd_impl;
+ DataLayout _data_layout;
+ TensorInfo _winograd_transformed_input;
+ TensorInfo _winograd_transformed_output;
+ TensorInfo _winograd_transformed_weights;
+ TensorInfo _input_workspace;
+ TensorInfo _output_workspace;
+ TensorInfo _weights_hwio;
+ TensorInfo _input_nhwc;
+ TensorInfo _output_nhwc;
+ bool _is_prepared;
+ bool _run_activation;
};
} // namespace cpu
} // namespace arm_compute
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 3069d6b541..343ef21c0b 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -24,12 +24,13 @@
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
#include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
#include <arm_neon.h>
@@ -53,7 +54,12 @@ namespace
* @param[in] num_threads Number of threads to run this method. Must be >= 1
*/
template <typename TypeInput, typename TypeOutput>
-void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads)
+void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm,
+ ITensor *dst,
+ const TypeInput *src,
+ int src_ld,
+ int src_multi_stride,
+ unsigned int num_threads)
{
ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
ARM_COMPUTE_ERROR_ON(num_threads == 0);
@@ -61,14 +67,14 @@ void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutpu
const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
std::vector<IScheduler::Workload> workloads(num_threads);
- for(unsigned int t = 0; t < num_threads; ++t)
+ for (unsigned int t = 0; t < num_threads; ++t)
{
- workloads[t] = [ = ](const ThreadInfo & info)
+ workloads[t] = [=](const ThreadInfo &info)
{
const unsigned int start = (info.thread_id * wsize) / num_threads;
const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads;
- if(start < end)
+ if (start < end)
{
gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end);
}
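The reformatted lambda above splits the B pretranspose window of size wsize across num_threads. The sketch below (plain C++, hypothetical sizes) checks that the start/end formula tiles the window exactly once without overlap and shows why the call is guarded with `if (start < end)`.

// Standalone sketch: the per-thread start/end split used by
// run_parallel_pretranspose_B_array, checked for full coverage.
#include <cassert>
#include <iostream>

int main()
{
    const unsigned int wsize       = 10; // total pretranspose window size
    const unsigned int num_threads = 4;

    unsigned int covered = 0;
    for (unsigned int t = 0; t < num_threads; ++t)
    {
        const unsigned int start = (t * wsize) / num_threads;
        const unsigned int end   = ((t + 1) * wsize) / num_threads;
        // Threads whose range is empty (start == end) simply do nothing,
        // which is what the guard in the lambda expresses.
        if (start < end)
        {
            std::cout << "thread " << t << " -> [" << start << ", " << end << ")\n";
            covered += end - start;
        }
    }
    assert(covered == wsize); // the ranges tile the window exactly once
    return 0;
}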
@@ -113,7 +119,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen
p.sections = 1;
p.indirect = false;
- if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+ if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
{
p.indirect = true;
p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
@@ -125,7 +131,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen
}
// Update M in case of GEMM3D for output
- if(info.depth_output_gemm3d != 0)
+ if (info.depth_output_gemm3d != 0)
{
p.M = d->tensor_shape().y() * d->tensor_shape().z();
p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
@@ -139,19 +145,24 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp
// Schedule assembly kernel
const int granule_threshold = 200;
IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
+ if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
{
scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
}
- else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
+ else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D &&
+ (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 ||
+ data_type == DataType::S8))
{
//GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
}
- else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
+ else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D &&
+ (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
{
//special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+ scheduling_hint =
+ IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
}
return scheduling_hint;
@@ -175,8 +186,12 @@ public:
* @param[in] gemm_info GEMM meta-data
* @param[in] os Output stage meta-data.
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+ void configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
const OutputStage &os = {});
/** Set requantization shifts to be used
@@ -193,19 +208,20 @@ public:
*
* @return A tuple with the pointers to the shift and multiplier data respectively
*/
- std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
- const std::vector<int32_t> &multipliers);
+ std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+ set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
bool is_configured() const override;
experimental::MemoryRequirements workspace() const override;
bool isVarWeightsKernel() const override
{
- if(!_gemm_kernel_asm)
+ if (!_gemm_kernel_asm)
return false;
- const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
+ const arm_compute::WeightFormat wf =
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY;
}
@@ -229,15 +245,15 @@ private:
void prepare_indirect_buffer(ITensorPack &tensors);
/** Assembly Gemm kernel */
- std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+ std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr};
/** Optimised Arm® Neon™ kernel */
- std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
+ std::unique_ptr<INEKernel> _optimised_kernel{nullptr};
/** Assembly GEMM workspace tensor info */
TensorInfo _workspace_info{};
/** Pre-transpose tensor info */
TensorInfo _pretranspose_info{};
/** Prepared flag */
- bool _is_prepared{ false };
+ bool _is_prepared{false};
/** GEMM meta-data */
AsmGemmInfo _gemm_info{};
/** GEMM kernel description */
@@ -251,26 +267,27 @@ private:
/** Indirect buffer */
std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
- std::vector<TypeInput> _indirect_pad{};
- arm_gemm::ConvolutionParameters _cp{};
- experimental::MemoryRequirements _aux_mem{ Count };
- bool _B_pretranspose_required{ false };
- bool _is_b_constant{ true };
- bool _is_c_constant{ true };
+ std::vector<TypeInput> _indirect_pad{};
+ arm_gemm::ConvolutionParameters _cp{};
+ experimental::MemoryRequirements _aux_mem{Count};
+ bool _B_pretranspose_required{false};
+ bool _is_b_constant{true};
+ bool _is_c_constant{true};
};
template <typename TypeInput, typename TypeOutput, class OutputStage>
std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
+ const std::vector<int32_t> &multipliers)
{
_multipliers = multipliers;
_shifts = shifts;
bool need_left = false;
- for(const auto s : _shifts)
+ for (const auto s : _shifts)
{
left_shifts.push_back(std::max(-s, int32_t(0)));
right_shifts.push_back(std::min(-s, int32_t(0)));
- if(s < 0 && !need_left)
+ if (s < 0 && !need_left)
{
need_left = true;
}
@@ -295,32 +312,35 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
const int multi_size = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInput);
- for(int64_t m = 0; m < multis; m++)
+ for (int64_t m = 0; m < multis; m++)
{
- for(int64_t b = 0; b < batches; b++)
+ for (int64_t b = 0; b < batches; b++)
{
- for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+ for (int64_t output_y = 0; output_y < _cp.output_height; output_y++)
{
- for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+ for (int64_t output_x = 0; output_x < _cp.output_width; output_x++)
{
int64_t output_xy = (output_y * _cp.output_width) + output_x;
- for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+ for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
{
- for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+ for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
{
int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
int64_t input_xy = (input_y * _cp.input_width) + input_x;
- if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
+ if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
{
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ _indirect_pad.data();
}
else
{
- _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ _indirect_buf
+ .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
}
}
@@ -332,12 +352,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
float zeropad = 0.f;
- if(is_data_type_quantized(a->data_type()))
+ if (is_data_type_quantized(a->data_type()))
{
zeropad = a->quantization_info().uniform().offset;
}
@@ -350,16 +373,25 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
- _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
- info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
- };
-
- if(info.method == AsmConvMethod::Conv)
+ _cp = {input_width,
+ input_height,
+ input_channels,
+ kernel_width,
+ kernel_height,
+ output_width,
+ output_height,
+ info.ps_info.stride().first,
+ info.ps_info.stride().second,
+ info.padding_top,
+ info.padding_left,
+ zeropad};
+
+ if (info.method == AsmConvMethod::Conv)
{
_gemm_kernel_asm->set_convolution_parameters(_cp);
}
- if(info.method == AsmConvMethod::Indirect)
+ if (info.method == AsmConvMethod::Indirect)
{
const unsigned int multis = 1;
const unsigned int batches = a->tensor_shape().total_size_upper(3);
@@ -372,19 +404,22 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
const int multi_size = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
- _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
- _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+ _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(
+ reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+ _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(
+ reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
_indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
// Set indirect argument
int64_t pos = 0;
- for(int64_t m = 0; m < multis; m++)
+ for (int64_t m = 0; m < multis; m++)
{
- for(int64_t b = 0; b < batches; b++)
+ for (int64_t b = 0; b < batches; b++)
{
- for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+ for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
{
- (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+ (_indirect_arg.get())[pos++] =
+ _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
}
}
}
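The indirect-GEMM hunks above flatten (multi, batch, kernel tap, output position) into a single offset and point out-of-bounds taps at a shared zero-padding row. The sketch below mirrors that addressing for a tiny hypothetical 3x3, stride-1 convolution; batches and multis are omitted for brevity, and plain pointers stand in for the unique_ptr buffers in the diff.

// Standalone sketch: building an indirect buffer that maps each
// (kernel tap, output position) pair to an input row or the pad row.
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const int64_t out_w = 2, out_h = 2;      // output spatial size
    const int64_t k_w = 3, k_h = 3;          // kernel size
    const int64_t in_w = 2, in_h = 2;        // input spatial size
    const int64_t pad_left = 1, pad_top = 1; // same-padding for stride 1
    const int64_t output_hw = out_w * out_h;

    const float pad_row[1] = {0.f}; // stands in for _indirect_pad
    std::vector<float> input(in_w * in_h, 1.f);
    std::vector<const float *> indirect(k_w * k_h * output_hw, nullptr);

    int64_t padded_taps = 0;
    for (int64_t oy = 0; oy < out_h; ++oy)
        for (int64_t ox = 0; ox < out_w; ++ox)
            for (int64_t ky = 0; ky < k_h; ++ky)
                for (int64_t kx = 0; kx < k_w; ++kx)
                {
                    const int64_t ix        = ox + kx - pad_left; // stride 1
                    const int64_t iy        = oy + ky - pad_top;
                    const int64_t kernel_xy = ky * k_w + kx;
                    const int64_t output_xy = oy * out_w + ox;
                    const int64_t idx       = kernel_xy * output_hw + output_xy;
                    if (ix < 0 || ix >= in_w || iy < 0 || iy >= in_h)
                    {
                        indirect[idx] = pad_row; // out of bounds -> zeros
                        ++padded_taps;
                    }
                    else
                    {
                        indirect[idx] = &input[iy * in_w + ix]; // real input row
                    }
                }
    std::cout << padded_taps << " of " << indirect.size() << " taps read padding\n";
    return 0;
}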
@@ -394,8 +429,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::GemmArgs args,
+ const AsmGemmInfo &gemm_info,
const OutputStage &os)
{
ARM_COMPUTE_UNUSED(c);
@@ -404,7 +443,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
_is_c_constant = c ? c->are_values_constant() : true;
_gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
- if(_gemm_kernel_asm == nullptr)
+ if (_gemm_kernel_asm == nullptr)
{
//configuration not supported: Leave function unconfigured:
return;
@@ -419,13 +458,14 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
const size_t workspace_size = _gemm_kernel_asm->get_working_size();
const unsigned int alignment = 4096;
_workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
- _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
+ _aux_mem[AsmGemmWorkspace] =
+ MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
//if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
//the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
{
const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
- if(window_size < static_cast<unsigned int>(args._maxthreads))
+ if (window_size < static_cast<unsigned int>(args._maxthreads))
{
_gemm_kernel_asm->set_nthreads(window_size);
}
@@ -434,18 +474,19 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
_optimised_kernel = std::move(acl_gemm_wrapper);
_gemm_info = gemm_info;
// Check for pre-transposed support
- if(_gemm_kernel_asm->B_pretranspose_required())
+ if (_gemm_kernel_asm->B_pretranspose_required())
{
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
_pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
- _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
- _B_pretranspose_required = true;
+ _aux_mem[Pretranspose] =
+ MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+ _B_pretranspose_required = true;
}
// Handle indirect GEMM convolution
- if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+ if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
{
configure_indirect(a, b, d, gemm_info);
}
@@ -454,34 +495,39 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
// Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
- if(c && c->info()->data_type() == DataType::S32)
+ if (c && c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
+ if (_gemm_kernel_asm->B_pretranspose_required())
{
// Fixed format kernels need no pretranspose.
- ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+ const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+ const auto in1_ptr =
+ reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
- run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+ in1_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads());
b->mark_as_unused();
}
- if(_gemm_info.method == AsmConvMethod::Indirect)
+ if (_gemm_info.method == AsmConvMethod::Indirect)
{
prepare_indirect_buffer(tensors);
}
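
The hunk above follows the usual do-once pattern for constant operands: pre-transpose B once, release the source weights, and build the indirect-convolution pointer table if needed. A standalone sketch of that control flow follows (hypothetical names; the real code works on member state and sets _is_prepared in the unshown tail of the function).

#include <functional>

// Sketch of the one-off preparation guard. The callbacks stand in for the
// pretranspose and indirect-buffer work done by the member functions above.
struct PrepareState
{
    bool is_prepared{false};
    bool b_pretranspose_required{false};
    bool is_indirect{false};
};

void prepare_once(PrepareState                &state,
                  const std::function<void()> &pretranspose_b,
                  const std::function<void()> &fill_indirect_buffer)
{
    if (state.is_prepared)
    {
        return; // subsequent calls are no-ops
    }
    if (state.b_pretranspose_required)
    {
        pretranspose_b(); // fills the Persistent aux buffer; B can then be marked unused
    }
    if (state.is_indirect)
    {
        fill_indirect_buffer(); // builds the pointer table for indirect convolution
    }
    state.is_prepared = true;
}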
@@ -526,12 +572,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
int multi_stride_b = 0;
const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();
- auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
+ auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
const TypeInput *in1_ptr = nullptr;
auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
// Check if B is pre-transposed and dereference it if not
- if(!_gemm_kernel_asm->B_is_pretransposed())
+ if (!_gemm_kernel_asm->B_is_pretransposed())
{
ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
@@ -539,30 +585,34 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
// Re-run the pretranspose on every call if either the weights or the biases are non-constant
- if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
+ if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
{
- if(c && c->info()->data_type() == DataType::S32)
+ if (c && c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+ _gemm_kernel_asm->set_quantized_bias(
+ reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
- if(_B_pretranspose_required)
+ if (_B_pretranspose_required)
{
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto b_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+ const auto b_ptr =
+ reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
- if(_is_b_constant)
+ if (_is_b_constant)
{
_gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
}
else
{
- run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+ run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+ b_ptr, ldb, multi_stride_b,
+ NEScheduler::get().num_threads());
}
}
}
@@ -571,17 +621,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
// Set the workspace if needed and reset the number of threads, as the buffer manager gets re-created with max_threads
CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
- if(workspace.get()->buffer() != nullptr)
+ if (workspace.get()->buffer() != nullptr)
{
_gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
const unsigned int split_dim = scheduling_hint.split_dimension();
const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
unsigned int num_threads = NEScheduler::get().num_threads();
- if(window_size < num_threads)
+ if (window_size < num_threads)
{
num_threads = window_size;
}
- if(split_dim != IScheduler::split_dimensions_all)
+ if (split_dim != IScheduler::split_dimensions_all)
{
// Make sure the kernel does not expect more threads than we can actually spawn
const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
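
The thread handling above boils down to one rule: never hand the kernel more threads than there is parallel work. A standalone sketch of that clamping (parameter names are illustrative):

#include <algorithm>

// Cap the scheduler's thread count by the kernel window size and, when a single
// split dimension is used, by the number of iterations along that dimension.
unsigned int clamp_num_threads(unsigned int scheduler_threads,
                               unsigned int window_size,
                               bool         has_single_split_dim,
                               unsigned int iterations_on_split_dim)
{
    unsigned int num_threads = std::min(scheduler_threads, window_size);
    if (has_single_split_dim)
    {
        // Make sure the kernel does not expect more threads than we can actually spawn.
        num_threads = std::min(num_threads, iterations_on_split_dim);
    }
    return num_threads;
}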
@@ -595,12 +645,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
// Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
TypeOutput *bias = nullptr;
- if(c && c->info()->data_type() != DataType::S32)
+ if (c && c->info()->data_type() != DataType::S32)
{
bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
}
- if(_gemm_info.method == AsmConvMethod::Indirect)
+ if (_gemm_info.method == AsmConvMethod::Indirect)
{
in0_ptr = nullptr;
lda = 0;
@@ -609,18 +659,20 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
}
// Set gemm parameters
- _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
- in1_ptr, ldb, multi_stride_b,
- out_ptr, ldd, batch_stride_d, multi_stride_d,
- bias, 0);
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr,
+ ldd, batch_stride_d, multi_stride_d, bias, 0);
// Schedule
NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
}
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
{
Params p = extract_parameters(a, b, d, info);
const CPUInfo &ci = NEScheduler::get().cpu_info();
@@ -628,7 +680,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -638,8 +691,12 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
- const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
- arm_gemm::Activation activation, const AsmGemmInfo &info)
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(activation);
Params p = extract_parameters(a, b, d, info);
@@ -648,7 +705,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
@@ -660,22 +718,20 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
const GEMMLowpOutputStageInfo os_info = info.output_stage;
arm_gemm::Requantize32 gemm_requant_info{};
- if(os_info.gemmlowp_shifts.size() > 1)
+ if (os_info.gemmlowp_shifts.size() > 1)
{
- const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
- std::get<2>(requantize_data),
- std::get<3>(requantize_data),
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ const auto requantize_data =
+ fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
+ gemm_requant_info = arm_gemm::Requantize32(
+ nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset,
+ (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data),
+ std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
}
else
{
- gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
- a_offset, b_offset, os_info.gemmlowp_offset,
- -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
- os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+ gemm_requant_info =
+ arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift,
+ os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
}
// Configure fallback
@@ -684,13 +740,16 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
}
} //namespace
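
The Requantize32 construction above only chooses how the descriptor is filled: pointers to per-channel shift/multiplier arrays when more than one shift is provided, a single scalar pair otherwise. For intuition, a generic per-tensor requantization step of the kind this descriptor parameterises is sketched below; the exact fixed-point rounding arm_gemm uses may differ, so treat it purely as an illustration.

#include <algorithm>
#include <cstdint>

// Illustration only: requantize one int32 accumulator with an output offset,
// a multiplier, a rounding right shift and clamp bounds.
int32_t requantize_s32(int64_t acc, int32_t multiplier, int32_t shift,
                       int32_t output_offset, int32_t min_bound, int32_t max_bound)
{
    int64_t v = acc * multiplier;
    if (shift > 0)
    {
        v = (v + (int64_t{1} << (shift - 1))) >> shift; // round to nearest
    }
    v += output_offset;
    v = std::max<int64_t>(min_bound, std::min<int64_t>(max_bound, v));
    return static_cast<int32_t>(v);
}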
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
- : _arm_gemm(nullptr)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr)
{
}
-Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
- const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_UNUSED(c);
@@ -701,53 +760,61 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg);
- switch(a->data_type())
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
+ info.fixed_format, info.fast_mode, &cfg);
+ switch (a->data_type())
{
case DataType::F32:
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F32 input");
break;
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for U8 input and U8 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for U8 input and U8 output");
}
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for S8 input and S8 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for S8 input and S8 output");
}
break;
#endif /* __aarch64__ */
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for BFLOAT16 input and F32 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for BFLOAT16 input and F32 output");
break;
}
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
- "We could not find an optimized kernel for F16 input and F16 output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+ "We could not find an optimized kernel for F16 input and F16 output");
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
@@ -759,26 +826,30 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
return Status{};
}
-Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::validate(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(c, info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run),
+ "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- if(is_data_type_quantized_per_channel(b->data_type()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
+ if (is_data_type_quantized_per_channel(b->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
}
- else if(is_fixed_format_fast_math(info.weight_format))
+ else if (is_fixed_format_fast_math(info.weight_format))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -787,22 +858,29 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32,
+ "Only F32 output supported for F32 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16,
+ "Only F16 output supported for F16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32,
+ "Only F32 output supported for BFLOAT16 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32,
+ "Only U32 output supported for U8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32,
+ "Only S32 output supported for S8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 &&
+ (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
"Only QASYMM8/S32 output supported for QASYMM8 input");
arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
- if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
+ if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
{
// Correctness check: if the format expected by the kernel is
// not "any", make sure that the one found matches the format
// intended by the caller.
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format),
- "The format expected by the kernel does not correspond with the one requested by the user.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (expected_weight_format != info.weight_format),
+ "The format expected by the kernel does not correspond with the one requested by the user.");
}
return ret;
}
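
The weight-format correctness check above also shows the intended caller pattern: query has_opt_impl() first, then make sure the reported format is compatible with what was requested. A minimal sketch, assumed to live in the arm_compute::cpu namespace:

// Sketch: mirrors the check inside validate() above. Returns true when an
// assembly kernel exists and its weight-format expectation is compatible with
// the caller's request.
bool can_use_assembly_gemm(const ITensorInfo *a,
                           const ITensorInfo *b,
                           const ITensorInfo *c,
                           const ITensorInfo *d,
                           const AsmGemmInfo &info)
{
    WeightFormat expected_wf = WeightFormat::UNSPECIFIED;
    const Status status      = CpuGemmAssemblyDispatch::has_opt_impl(expected_wf, a, b, c, d, info);
    return static_cast<bool>(status) &&
           (expected_wf == WeightFormat::ANY || expected_wf == info.weight_format);
}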
@@ -813,18 +891,19 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo
return act.type != arm_gemm::Activation::Type::None;
}
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
+void CpuGemmAssemblyDispatch::configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
//If the combination of data types is not supported, silently return: it is the caller's responsibility to check whether configure() succeeded via is_configured()
- if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
+ if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
{
return;
}
- switch(a->data_type())
+ switch (a->data_type())
{
case DataType::F32:
create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
@@ -832,7 +911,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
}
@@ -843,7 +922,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
- if(d->data_type() == DataType::S32)
+ if (d->data_type() == DataType::S32)
{
create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
}
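
As the comment in configure() notes, an unsupported type combination makes configure() return silently, so callers must check is_configured() afterwards. A minimal caller-side sketch with placeholder shapes (arm_compute and arm_compute::cpu namespaces assumed):

// Sketch: configure() is allowed to do nothing; is_configured() tells the caller
// whether an assembly kernel was actually selected. Shapes follow the usual
// GEMM convention of a=[K,M], b=[N,K], d=[N,M].
bool try_configure_assembly_gemm(cpu::CpuGemmAssemblyDispatch &gemm)
{
    const TensorInfo a(TensorShape(64U, 32U), 1, DataType::F32);  // K=64,  M=32
    const TensorInfo b(TensorShape(128U, 64U), 1, DataType::F32); // N=128, K=64
    TensorInfo       d(TensorShape(128U, 32U), 1, DataType::F32); // N=128, M=32
    cpu::AsmGemmInfo info{};                                      // struct defaults, see the header below
    gemm.configure(&a, &b, nullptr, &d, info);                    // bias omitted
    return gemm.is_configured();
}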
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index ceb7a3f775..5be39a54c0 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
@@ -42,20 +43,20 @@ enum class AsmConvMethod
struct AsmGemmInfo
{
- AsmConvMethod method{ AsmConvMethod::Im2Col };
+ AsmConvMethod method{AsmConvMethod::Im2Col};
PadStrideInfo ps_info{};
ActivationLayerInfo activation_info{};
GEMMLowpOutputStageInfo output_stage{};
- bool negated_offsets{ true };
- bool reinterpret_input_as_3d{ false };
- bool depth_output_gemm3d{ false };
- int64_t padding_top{ 0 };
- int64_t padding_left{ 0 };
- float padding_value{ 0.f };
- bool fast_mode{ false };
- bool fixed_format{ false };
- arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED };
- bool reshape_b_only_on_first_run{ true };
+ bool negated_offsets{true};
+ bool reinterpret_input_as_3d{false};
+ bool depth_output_gemm3d{false};
+ int64_t padding_top{0};
+ int64_t padding_left{0};
+ float padding_value{0.f};
+ bool fast_mode{false};
+ bool fixed_format{false};
+ arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
+ bool reshape_b_only_on_first_run{true};
};
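
With the brace-initialised defaults above, a value-initialised AsmGemmInfo is already usable; callers typically override only a few fields. A hedged example for a fixed-format GEMM with a fused activation (field values are illustrative, not a recommendation):

// Illustrative only: start from the defaults and override what differs.
AsmGemmInfo make_fixed_format_gemm_info(const ActivationLayerInfo &act)
{
    AsmGemmInfo info{};                                    // defaults as declared above
    info.activation_info = act;                            // activation to fuse, if the kernel supports it
    info.fixed_format    = true;                           // weights are supplied in a kernel-chosen layout
    info.weight_format   = arm_compute::WeightFormat::ANY; // let has_opt_impl() report the layout to use
    info.fast_mode       = false;                          // keep full precision
    return info;
}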
/** Assembly kernel glue */
@@ -72,12 +73,12 @@ public:
class IFallback
{
public:
- virtual void run(ITensorPack &tensors) = 0;
- virtual void prepare(ITensorPack &tensors) = 0;
- virtual experimental::MemoryRequirements workspace() const = 0;
- virtual bool is_configured() const = 0;
- virtual bool isVarWeightsKernel() const = 0;
- virtual ~IFallback() = default;
+ virtual void run(ITensorPack &tensors) = 0;
+ virtual void prepare(ITensorPack &tensors) = 0;
+ virtual experimental::MemoryRequirements workspace() const = 0;
+ virtual bool is_configured() const = 0;
+ virtual bool isVarWeightsKernel() const = 0;
+ virtual ~IFallback() = default;
};
public:
@@ -121,7 +122,8 @@ public:
* @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
* @param[in] info GEMM meta-data
*/
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+ void configure(
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
/** Indicates whether or not this function can be used to process the given parameters.
*
@@ -133,7 +135,11 @@ public:
*
* @return a status.
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
@@ -144,7 +150,12 @@ public:
*
* @return a status.
*/
- static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+ static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *d,
+ const AsmGemmInfo &info);
/** Checks if activation is supported by the gemm assembly dispatcher
*
* @param[in] activation Activation to check
@@ -167,8 +178,8 @@ public:
}
// Inherited methods overridden:
- void prepare(ITensorPack &tensors) override;
- void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
experimental::MemoryRequirements workspace() const override;
private:
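
End to end, a caller drives the dispatcher through the standard operator lifecycle: validate and configure with tensor infos, then prepare() once and run() per call with an ITensorPack. The sketch below is a rough illustration with the arm_compute namespace assumed; allocation and the binding of the auxiliary buffers reported by workspace() are elided, as the runtime normally handles them.

// Rough lifecycle sketch. Slot ids follow the ACL_SRC_0/1/2 and ACL_DST
// convention; auxiliary workspace binding is omitted for brevity.
void run_assembly_gemm(cpu::CpuGemmAssemblyDispatch &gemm,
                       ITensor *a, ITensor *b, ITensor *c, ITensor *d,
                       const cpu::AsmGemmInfo &info)
{
    if (!static_cast<bool>(cpu::CpuGemmAssemblyDispatch::validate(
            a->info(), b->info(), c ? c->info() : nullptr, d->info(), info)))
    {
        return; // caller falls back to the non-assembly path
    }
    gemm.configure(a->info(), b->info(), c ? c->info() : nullptr, d->info(), info);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, a);
    pack.add_const_tensor(TensorType::ACL_SRC_1, b);
    if (c != nullptr)
    {
        pack.add_const_tensor(TensorType::ACL_SRC_2, c);
    }
    pack.add_tensor(TensorType::ACL_DST, d);

    gemm.prepare(pack); // one-off: B pre-transpose, indirect buffer, bias setup
    gemm.run(pack);     // per-call execution on the NEScheduler
}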