author    SiCong Li <sicong.li@arm.com>  2023-10-17 17:38:57 +0100
committer SiCong Li <sicong.li@arm.com>  2023-11-08 09:49:56 +0000
commit    c5ab4df0c11dc66db47f2070edc719923af3367e (patch)
tree      c04bdac32528e628b2a9b9a1c1653e300328fc1b /arm_compute
parent    4a9dbedfbfa66c2612c7461e60cd867b8aea825b (diff)
download  ComputeLibrary-c5ab4df0c11dc66db47f2070edc719923af3367e.tar.gz
Optimize CpuGemmConv2d start-up time
When the weight tensor has no holes, we can replace CpuWeightsReshapeKernel with:
- Collapse by reinterpreting the weight's 3 spatial dimensions
- Perform CpuTranspose

For more details see the documentation in src/cpu/operators/CpuGemmConv2d.cpp

This is one optimization, since CpuTranspose performs better than CpuWeightsReshapeKernel.

A second optimization is to fuse this transpose with other weight transformations (e.g. pretranspose_B_array in CpuGemmAssemblyDispatch). However, this second optimization depends on how the underlying gemm methods (the fallback path: CpuGemmMatrixMultiplyKernel, or the assembly path: CpuGemmAssemblyDispatch) choose to fuse the transpose. Therefore, this patch moves the transpose down from CpuGemmConv2d to the individual gemm operators, where the fusion decision needs to be made, by passing an extra "transpose_b" flag to CpuGemm.

New transpose_b flag in different scopes (they are all the same, but with different names because pretranspose_b has a different meaning in GemmAssemblyDispatch):
  GEMMInfo::pretranspose_B -> AsmGemmInfo::transpose_b

New auxiliary tensors holding the transposed b result:
- CpuGemm optimized path: CpuGemmAssemblyDispatch::PrePretransposedB
- CpuGemm fallback path: CpuGemm::PreTransposedRHS

Note that this patch does not yet implement the second optimization (COMPMID-6595), but it prepares for it.

Relates to COMPMID-6595
Resolves COMPMID-6499

Change-Id: I999a2da9da4b2b15369a3cc06d7872c86e0190ea
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10526
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
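[Editor's note] Conceptually, when the weight tensor is stored contiguously (no padding holes between its spatial dimensions), the reshape that CpuWeightsReshapeKernel performs degenerates into a plain 2D transpose: each filter's kernel_h x kernel_w x channels block can be reinterpreted as a single flat dimension. The following is a minimal standalone C++ sketch of that idea; the layout and function name here are illustrative only and are not the library's actual internals:

    #include <cstddef>
    #include <vector>

    // Sketch: weights stored contiguously as [num_filters][kh * kw * channels].
    // With no holes, the 3 spatial dimensions collapse into one flat extent K,
    // so producing the GEMM-ready layout [K][num_filters] is just a 2D transpose.
    std::vector<float> reshape_weights_via_transpose(const std::vector<float> &weights,
                                                     std::size_t num_filters,
                                                     std::size_t kh,
                                                     std::size_t kw,
                                                     std::size_t channels)
    {
        const std::size_t k = kh * kw * channels; // collapsed spatial extent
        std::vector<float> out(k * num_filters);
        for (std::size_t f = 0; f < num_filters; ++f)
        {
            for (std::size_t i = 0; i < k; ++i)
            {
                out[i * num_filters + f] = weights[f * k + i]; // plain 2D transpose
            }
        }
        return out;
    }

This is why the patch can delegate the work to CpuTranspose, which performs better than the dedicated reshape kernel.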
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/function_info/GEMMInfo.h                    | 8
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolutionLayer.h | 8
2 files changed, 10 insertions, 6 deletions
diff --git a/arm_compute/function_info/GEMMInfo.h b/arm_compute/function_info/GEMMInfo.h
index c24762c0aa..a827c79fda 100644
--- a/arm_compute/function_info/GEMMInfo.h
+++ b/arm_compute/function_info/GEMMInfo.h
@@ -105,6 +105,7 @@ public:
* @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
* @param[in] fixed_format (Optional) Specify the selection of fixed format kernels for variable weights support in GEMM. These kernels expect the weights tensor to be in a memory format that is fixed by the kernel itself. For more information, see arm_compute::WeightFormat.
* @param[in] weight_format (Optional) arm_gemm:WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
+ * @param[in] pretranspose_B (Optional) Pretranspose matrix B (transposition of its lowest 2 dimensions), in addition to, and before, any further transformations of B
*/
GEMMInfo(bool is_a_reshaped,
bool is_b_reshaped,
@@ -118,7 +119,8 @@ public:
bool broadcast_bias = false,
const ActivationLayerInfo &activation_info = ActivationLayerInfo(),
bool fixed_format = false,
- arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED) noexcept
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED,
+ bool pretranspose_B = false) noexcept
: _is_a_reshaped(is_a_reshaped),
_is_b_reshaped(is_b_reshaped),
_reshape_b_only_on_first_run(reshape_b_only_on_first_run),
@@ -130,7 +132,7 @@ public:
_fp_mixed_precision(fp_mixed_precision),
_broadcast_bias(broadcast_bias),
_pretranspose_A(false),
- _pretranspose_B(false),
+ _pretranspose_B(pretranspose_B),
_activation_info(activation_info),
_fixed_format(fixed_format),
_weight_format(weight_format)
@@ -251,6 +253,8 @@ public:
_pretranspose_A = flag;
}
/** Flag which specifies whether b should be pre-transposed if supported.
+ * More concretely, the "pre-transpose" is the transposition of the b tensor's lowest 2 dimensions.
+ * If set to true, this pre-transpose occurs in addition to, and before, any further transformations of the b matrix.
*
* @return True if b should be pre-transposed else false.
*/
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index cdf0f652e1..2d07980ade 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NECONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
@@ -38,7 +38,7 @@ namespace arm_compute
class ITensor;
/** Basic function to simulate a convolution layer. This function calls one of the following functions:
- * -# @ref cpu::CpuGemm (executed only in case GEMM is required for the operation)
+ * -# @ref cpu::CpuGemmConv2d (executed only in case GEMM is required for the operation)
* -# @ref cpu::CpuWinogradConv2d (executed only in case Winograd is required for the operation)
* -# @ref cpu::CpuDirectConv2d (executed only in case Direct Convolution is required for the operation)
* -# @ref NEFFTConvolutionLayer (executed only in case FFT is required for the operation)
@@ -196,4 +196,4 @@ private:
std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
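
[Editor's note] For illustration, a caller could request the additional pre-transpose of B introduced by this patch on a GEMMInfo object. This is a hedged sketch: the setter name set_pretranspose_B is assumed by symmetry with the set_pretranspose_A setter shown in the diff above, and the helper function is hypothetical:

    #include "arm_compute/function_info/GEMMInfo.h"

    // Sketch: enable the pre-transpose of B on a default-constructed GEMMInfo.
    arm_compute::GEMMInfo make_gemm_info_with_pretransposed_b()
    {
        arm_compute::GEMMInfo info{};
        info.set_pretranspose_B(true); // B's lowest 2 dimensions are transposed
                                       // before any further transformations of B
        return info;
    }

Per the commit message, when this flag reaches CpuGemm the transposed B is materialized into a new auxiliary tensor (CpuGemm::PreTransposedRHS on the fallback path, CpuGemmAssemblyDispatch::PrePretransposedB on the optimized path).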