From f2c022e54ae65467cca4bc698b5b94e5b3c62c47 Mon Sep 17 00:00:00 2001 From: cfRod Date: Fri, 5 Nov 2021 11:29:53 +0000 Subject: Enable fast_math in CpuFullyConnected ONCPUML-529 * Add support for passing fast_math for fullyconnected layers via fc_info. * Add support for passing fast_math to run ACL benchmark graphs. * Add validation test and accuracy tests (updated fixtures). Note: abs and rel. tolerance for fast math mode are set based on experimental data. Signed-off-by: cfRod change-Id: Ib107d6264d3ae5e36555334f39a13e678f8618df Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6521 Reviewed-by: SiCong Li Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/core/Types.h | 9 +++++++++ arm_compute/graph/GraphBuilder.h | 8 ++++++-- arm_compute/graph/backends/FunctionHelpers.h | 3 ++- arm_compute/graph/frontend/Layers.h | 4 ++-- arm_compute/graph/nodes/FullyConnectedLayerNode.h | 17 +++++++++++++++-- src/cpu/operators/CpuFullyConnected.cpp | 15 +++++++++++---- src/cpu/operators/CpuFullyConnected.h | 1 + src/graph/GraphBuilder.cpp | 8 ++++---- src/graph/nodes/FullyConnectedLayer.cpp | 15 ++++++++++++--- 9 files changed, 62 insertions(+), 18 deletions(-) diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index bff672c361..47df44cb67 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1578,6 +1578,7 @@ struct FullyConnectedLayerInfo bool transpose_weights{ true }; /**< Transpose weights if true. */ bool are_weights_reshaped{ false }; /**< Reshape the weights tensor if false. */ bool retain_internal_weights{ false }; /**< Retain internal reshaped weights. */ + bool enable_fast_math{ false }; /**< Enable fast math computation. */ /* Other parameters */ bool fp_mixed_precision{ false }; /**< Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. 
*/ @@ -2107,6 +2108,14 @@ public: { return _fast_math; }; + /** Set fast math flag + * + * @param[in] fast_math Flag to set + */ + void set_fast_math(bool fast_math) + { + _fast_math = fast_math; + } /** Flag which specifies whether to broadcast the shape of the bias tensor. * * @return True if the shape of the bias tensor is to be broadcasted. diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h index 14ad0571ef..cb88c0e7aa 100644 --- a/arm_compute/graph/GraphBuilder.h +++ b/arm_compute/graph/GraphBuilder.h @@ -295,13 +295,15 @@ public: * @param[in] bias_nid (Optional) Node ID of the bias node data. Defaults to EmptyNodeID * @param[in] fc_info (Optional) Fully connected layer metadata * @param[in] out_quant_info (Optional) Output quantization info + * @param[in] fast_math_hint (Optional) Fast math hint * * @return Node ID of the created node, EmptyNodeID in case of error */ static NodeID add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, NodeID weights_nid, NodeID bias_nid = EmptyNodeID, const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), - const QuantizationInfo &out_quant_info = QuantizationInfo()); + const QuantizationInfo &out_quant_info = QuantizationInfo(), + FastMathHint fast_math_hint = FastMathHint::Disabled); /** Adds a fully connected layer node to the graph * * @param[in] g Graph to add the layer to @@ -313,6 +315,7 @@ public: * @param[in] fc_info (Optional) Fully connected layer metadata * @param[in] weights_quant_info (Optional) Weights quantization info * @param[in] out_quant_info (Optional) Output quantization info + * @param[in] fast_math_hint (Optional) Fast math hint * * @return Node ID of the created node, EmptyNodeID in case of error */ @@ -320,7 +323,8 @@ public: ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const QuantizationInfo 
&weights_quant_info = QuantizationInfo(), - const QuantizationInfo &out_quant_info = QuantizationInfo()); + const QuantizationInfo &out_quant_info = QuantizationInfo(), + FastMathHint fast_math_hint = FastMathHint::Disabled); /** Adds a generate proposals layer node to the graph * * @param[in] g Graph to add the layer to diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h index 6aec3f6590..55af056a43 100644 --- a/arm_compute/graph/backends/FunctionHelpers.h +++ b/arm_compute/graph/backends/FunctionHelpers.h @@ -1096,7 +1096,8 @@ std::unique_ptr create_fully_connected_layer(FullyConnectedLayerNode typename TargetInfo::TensorType *weights = get_backing_tensor(node.input(1)); typename TargetInfo::TensorType *biases = get_backing_tensor(node.input(2)); typename TargetInfo::TensorType *output = get_backing_tensor(node.output(0)); - const FullyConnectedLayerInfo fc_info = node.info(); + FullyConnectedLayerInfo fc_info = node.info(); + fc_info.enable_fast_math = (node.fast_math_hint() == FastMathHint::Enabled); ARM_COMPUTE_ERROR_ON(input == nullptr); ARM_COMPUTE_ERROR_ON(weights == nullptr); diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h index bf68b269da..fe0539bac5 100644 --- a/arm_compute/graph/frontend/Layers.h +++ b/arm_compute/graph/frontend/Layers.h @@ -776,7 +776,7 @@ public: { return GraphBuilder::add_fully_connected_layer(s.graph(), common_params, input, _num_outputs, std::move(_weights), std::move(_bias), _fc_info, - std::move(_weights_quant_info), std::move(_out_quant_info)); + std::move(_weights_quant_info), std::move(_out_quant_info), s.hints().fast_math_hint); } else { @@ -785,7 +785,7 @@ public: NodeID bias_nid = (_bias_ss == nullptr) ? 
EmptyNodeID : _bias_ss->tail_node(); return GraphBuilder::add_fully_connected_layer(s.graph(), common_params, input, _num_outputs, _weights_ss->tail_node(), bias_nid, _fc_info, - std::move(_out_quant_info)); + std::move(_out_quant_info), s.hints().fast_math_hint); } } diff --git a/arm_compute/graph/nodes/FullyConnectedLayerNode.h b/arm_compute/graph/nodes/FullyConnectedLayerNode.h index a7712f46b9..9ade62bf4a 100644 --- a/arm_compute/graph/nodes/FullyConnectedLayerNode.h +++ b/arm_compute/graph/nodes/FullyConnectedLayerNode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,10 +39,22 @@ public: * @param[in] num_outputs Number of neurons in the layer * @param[in] out_quant_info (Optional) Output quantization info * @param[in] fc_info (Optional) Additional information about the fully connected layer + * @param[in] fast_math_hint (Optional) Fast math hint */ FullyConnectedLayerNode(unsigned int num_outputs, QuantizationInfo out_quant_info = QuantizationInfo(), - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + FastMathHint fast_math_hint = FastMathHint::Disabled); + /** Sets the fast math hint + * + * @param[in] hint Hint to use for fullyconnected layer + */ + void set_fast_math_hint(FastMathHint hint); + /** Fast math hint accessor + * + * @return Fast math hint to be used by the node + */ + FastMathHint fast_math_hint() const; /** Sets fused activation * * @param[in] fused_activation Fused activation to set @@ -94,6 +106,7 @@ private: unsigned int _num_outputs; QuantizationInfo _out_quant_info; FullyConnectedLayerInfo _info; + FastMathHint _fast_math_hint; }; } // namespace graph } // namespace arm_compute diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp index 03c53b001d..6d77c614f7 100644 --- a/src/cpu/operators/CpuFullyConnected.cpp +++ 
b/src/cpu/operators/CpuFullyConnected.cpp @@ -109,7 +109,7 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act) +Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math) { if(is_data_type_quantized_asymmetric(src->data_type())) { @@ -123,6 +123,7 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe GEMMInfo gemm_info; gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); + gemm_info.set_fast_math(enable_fast_math); // Validate gemmlowp function TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); @@ -135,7 +136,9 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe } else { - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */))); + GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */); + gemm_info.set_fast_math(enable_fast_math); + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, gemm_info)); } return Status{}; @@ -158,7 +161,8 @@ CpuFullyConnected::CpuFullyConnected() _needs_weights_reshape(false), _is_fc_after_conv(false), _is_quantized_asymmetric(false), - _is_prepared(false) + _is_prepared(false), + _enable_fast_math(false) { } @@ -185,6 +189,7 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo * GEMMInfo gemm_info; gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); gemm_info.set_activation_info(act); + gemm_info.set_fast_math(_enable_fast_math); _mm_gemmlowp = std::make_unique(); _mm_gemmlowp->configure(&src_info, &weights_info, 
biases, dst, gemm_info); } @@ -193,6 +198,7 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo * // Configure matrix multiply kernel GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */); gemm_info.set_activation_info(act); + gemm_info.set_fast_math(_enable_fast_math); _mm_gemm = std::make_unique(); _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info); } @@ -241,6 +247,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); _is_prepared = false; _trans_weights_idx = AuxTensorIdx::Count; + _enable_fast_math = fc_info.enable_fast_math; // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -418,7 +425,7 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); } // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math)); return Status{}; } diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h index 304ea3c62b..44fa21f9f8 100644 --- a/src/cpu/operators/CpuFullyConnected.h +++ b/src/cpu/operators/CpuFullyConnected.h @@ -141,6 +141,7 @@ private: bool _is_fc_after_conv; bool _is_quantized_asymmetric; bool _is_prepared; + bool _enable_fast_math; }; } // namespace cpu } // namespace arm_compute diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp index 01d35a15b9..15abf3738a 100644 --- a/src/graph/GraphBuilder.cpp +++ b/src/graph/GraphBuilder.cpp @@ -467,7 +467,7 @@ NodeID GraphBuilder::add_flatten_node(Graph &g, NodeParams params, 
NodeIdxPair i NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, NodeID weights_nid, NodeID bias_nid, - const FullyConnectedLayerInfo fc_info, const QuantizationInfo &out_quant_info) + const FullyConnectedLayerInfo fc_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON(num_outputs == 0); @@ -479,7 +479,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]); // Create fully connected node and connect - NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info); + NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info, fast_math_hint); g.add_connection(input.node_id, input.index, fc_nid, 0); g.add_connection(weights_nid, 0, fc_nid, 1); if(has_bias) @@ -495,7 +495,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs, ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const FullyConnectedLayerInfo fc_info, - const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info) + const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint) { check_nodeidx_pair(input, g); ARM_COMPUTE_ERROR_ON(num_outputs == 0); @@ -523,7 +523,7 @@ NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, Node } // Create fully connected node and connect - NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info); + NodeID fc_nid = g.add_node(num_outputs, out_quant_info, fc_info, fast_math_hint); g.add_connection(input.node_id, input.index, fc_nid, 0); g.add_connection(w_nid, 0, fc_nid, 1); if(has_bias) diff --git 
a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp index 442f636b61..6278227878 100644 --- a/src/graph/nodes/FullyConnectedLayer.cpp +++ b/src/graph/nodes/FullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,12 +31,21 @@ namespace arm_compute { namespace graph { -FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, QuantizationInfo out_quant_info, FullyConnectedLayerInfo fc_info) - : _num_outputs(num_outputs), _out_quant_info(std::move(out_quant_info)), _info(fc_info) +FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, QuantizationInfo out_quant_info, FullyConnectedLayerInfo fc_info, FastMathHint fast_math_hint) + : _num_outputs(num_outputs), _out_quant_info(std::move(out_quant_info)), _info(fc_info), _fast_math_hint(fast_math_hint) { _input_edges.resize(3, EmptyEdgeID); _outputs.resize(1, NullTensorID); } +void FullyConnectedLayerNode::set_fast_math_hint(FastMathHint hint) +{ + _fast_math_hint = hint; +} + +FastMathHint FullyConnectedLayerNode::fast_math_hint() const +{ + return _fast_math_hint; +} void FullyConnectedLayerNode::set_fused_activation(ActivationLayerInfo fused_activation) { -- cgit v1.2.1