about | summary | refs | log | tree | commit | diff
path: root/src/runtime
diff options
context:
space:
mode:
author: Georgios Pinitas <georgios.pinitas@arm.com> 2021-07-16 16:16:43 +0100
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2021-07-22 02:25:50 +0000
commit: 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 (patch)
tree: 2f8362d33cdad4212f4b96995681c68184c759e1 /src/runtime
parent: 59fd7a722e5bc7e85309d6200bc37a772721a719 (diff)
download: ComputeLibrary-4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2.tar.gz
Update GEMM assembly kernels
- Introduce Fp32 kernels with internal calculations in Bfloat16 when fast_mode is enabled
- Improve kernel selection heuristics

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime')
-rw-r--r-- src/runtime/CL/functions/CLFullyConnectedLayer.cpp | 2
-rw-r--r-- src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp | 2
-rw-r--r-- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 2
-rw-r--r-- src/runtime/cpu/operators/CpuGemm.cpp | 1
-rw-r--r-- src/runtime/cpu/operators/CpuGemmConvolution.cpp | 9
-rw-r--r-- src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp | 1
-rw-r--r-- src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp | 1
-rw-r--r-- src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 10
-rw-r--r-- src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h | 1
9 files changed, 19 insertions, 10 deletions
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index bc9a3056e8..0647a473e2 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -101,6 +101,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
fc_info.retain_internal_weights, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
ActivationLayerInfo()); // activation_info
@@ -151,6 +152,7 @@ void CLFullyConnectedLayer::configure_mm(const CLCompileContext &compile_context
fc_info.retain_internal_weights, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
fc_info.activation_info, // activation_info
fc_info.constant_weights); // constant_weights
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index f926b1d0a6..16735dde0e 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -128,6 +128,7 @@ void CLGEMMConvolutionLayer::configure_mm(const CLCompileContext &compile_contex
false, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
false, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
act_info); // activation_info
@@ -167,6 +168,7 @@ Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
false, // retain_internal_weights
gemmlowp_output_stage, // gemmlowp_output_stage
false, // fp_mixed_precision
+ false, // fast_math
true, // broadcast_bias
act_info); // activation_info
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 1022e397d0..e88bd3b5d4 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -67,7 +67,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
case ConvolutionMethod::GEMM:
{
auto f = std::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
+ f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
_function = std::move(f);
break;
}
diff --git a/src/runtime/cpu/operators/CpuGemm.cpp b/src/runtime/cpu/operators/CpuGemm.cpp
index 9a4d171ce6..c6abe1f893 100644
--- a/src/runtime/cpu/operators/CpuGemm.cpp
+++ b/src/runtime/cpu/operators/CpuGemm.cpp
@@ -48,6 +48,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
asm_info.activation_info = info.activation_info();
+ asm_info.fast_mode = info.fast_math();
return asm_info;
}
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
index a0424b1c63..fcdf8aa8f6 100644
--- a/src/runtime/cpu/operators/CpuGemmConvolution.cpp
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
@@ -66,7 +66,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo
// Create GEMMInfo structure
const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
+ false, GEMMLowpOutputStageInfo(), false, false, false, act_info);
// Supported activations in GEMM
const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
@@ -115,7 +115,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo
quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
- _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, false, act_info));
auto mm_mem_req = _mm_gemmlowp->workspace();
for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
@@ -146,7 +146,7 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo
// Create GEMMInfo structure
const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
+ false, GEMMLowpOutputStageInfo(), false, false, false, act_info);
if(is_quantized)
{
@@ -186,7 +186,8 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo
std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, false,
+ act_info));
}
else
{
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
index c2e9f24ff6..10eece99eb 100644
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -86,6 +86,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect
asm_info.padding_left = info.conv_info.pad_left();
asm_info.padding_value = 0.f;
asm_info.negated_offsets = false;
+ asm_info.fast_mode = info.enable_fast_math;
return asm_info;
}
} // namespace
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 651ce436a0..56eb4fbb87 100644
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -63,6 +63,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
asm_info.activation_info = info.activation_info();
asm_info.output_stage = info.gemmlowp_output_stage();
+ asm_info.fast_mode = info.fast_math();
return asm_info;
}
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 79ea1cb5a7..bbbd5ac458 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -542,7 +542,7 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
const CPUInfo &ci = NEScheduler::get().cpu_info();
unsigned int num_threads = NEScheduler::get().num_threads();
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -556,11 +556,11 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
arm_gemm::Activation activation, const AsmGemmInfo &info)
{
ARM_COMPUTE_UNUSED(activation);
- Params p = extract_parameters(a, b, d, info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 355273adeb..88cfed002a 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -51,6 +51,7 @@ struct AsmGemmInfo
int64_t padding_top{ 0 };
int64_t padding_left{ 0 };
float padding_value{ 0.f };
+ bool fast_mode{ false };
};
/** Assembly kernel glue */