author	Moritz Pflanzer <moritz.pflanzer@arm.com>	2017-09-15 10:42:58 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:35:24 +0000
commit	80373f607cb12693824411510c39e367a4dfbdb5 (patch)
tree	ddc4d038783ed91ff227fb259a85fefc09e46319 /src/runtime/NEON/functions
parent	c09314a288dc2aa7ef75a09a8ff5dede3f80974a (diff)
COMPMID-481: Add AArch32 GEMM
Change-Id: Idba0b30bfb27866a46a22388014ab81432ea28dc
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86196
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Diffstat (limited to 'src/runtime/NEON/functions')
-rw-r--r--	src/runtime/NEON/functions/NEConvolutionLayer.cpp	19
-rw-r--r--	src/runtime/NEON/functions/NEGEMM.cpp	29
2 files changed, 35 insertions(+), 13 deletions(-)
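
This patch mirrors the existing AArch64 fast path for 32-bit ARM: when the build targets __arm__, the CPU reports ARMv7 and the data type is F32, an assembly kernel (NEGEMMAArch32Kernel, using an 8x6 output tile) is selected instead of the generic NEON path; __aarch64__ builds keep the 12x8 kernel. Below is a minimal sketch of that compile-time dispatch pattern. The kernel names mirror the real ones, but the scaffolding (KernelBase, make_optimised_kernel) is hypothetical and exists only for illustration.

#include <memory>

struct KernelBase { virtual ~KernelBase() = default; };
struct GemmAArch32Kernel : KernelBase {}; // stands in for NEGEMMAArch32Kernel
struct GemmAArch64Kernel : KernelBase {}; // stands in for NEGEMMAArch64Kernel

std::unique_ptr<KernelBase> make_optimised_kernel(bool is_f32)
{
#if defined(__arm__)
    // 32-bit build: use the ARMv7 assembly kernel (8x6 output tile)
    if(is_f32) return std::make_unique<GemmAArch32Kernel>();
#elif defined(__aarch64__)
    // 64-bit build: use the AArch64 assembly kernel (12x8 output tile)
    if(is_f32) return std::make_unique<GemmAArch64Kernel>();
#endif
    (void)is_f32;   // unused on other targets
    return nullptr; // caller falls back to the generic NEON path
}

The returned null pointer corresponds to the `_mm_optimised_kernel != nullptr` checks in the diff: if no assembly kernel applies, the existing interleave/transpose/multiply kernels run instead.
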
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 44bf2de70c..cbe3b65c34 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
@@ -34,6 +35,7 @@
namespace arm_compute
{
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
} // namespace arm_compute
@@ -151,12 +153,17 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Check if it's a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
_is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
-#if defined(__aarch64__)
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
{
_mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
}
-#endif /* defined(__aarch64__) */
+#endif /* defined(__arm__) || defined(__aarch64__) */
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
@@ -240,7 +247,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
// Configure kernels
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
-#if defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__)
if(_mm_optimised_kernel != nullptr)
{
struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -249,7 +256,11 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
const int N = _gemm_output.info()->tensor_shape().x();
const int K = _input_im2col_reshaped.info()->tensor_shape().x();
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
constexpr size_t alignment = 4096;
_workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
@@ -268,7 +279,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
_workspace.allocator()->allocate();
}
else
-#endif /* defined(__aarch64__) */
+#endif /* defined(__arm__) || defined(__aarch64__) */
{
if(_is_fully_connected_convolution)
{
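
Both files size the assembly path's scratch workspace the same way: alignment is fixed at 4096 bytes, and each thread gets its own slice padded by alignment - 1 bytes so that every slice can be realigned to a 4096-byte boundary. A worked example of that arithmetic follows; the concrete working size and thread count are assumed for illustration, not taken from the library.

#include <cstddef>
#include <cstdio>

int main()
{
    constexpr std::size_t alignment    = 4096;   // page-sized alignment, as in the patch
    const std::size_t     working_size = 150000; // hypothetical GemmInterleaved::get_working_size()
    const unsigned int    num_threads  = 4;      // hypothetical scheduler thread count

    // One padded slice per thread; the extra (alignment - 1) bytes per slice
    // guarantee each slice can be realigned to a 4096-byte boundary.
    const std::size_t workspace_bytes = (working_size + alignment - 1) * num_threads;
    std::printf("workspace: %zu bytes\n", workspace_bytes); // 616380 bytes here
}
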
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 1d6aa65e37..ff92ef8351 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
@@ -37,6 +38,7 @@
namespace arm_compute
{
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
} // namespace arm_compute
@@ -68,13 +70,6 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
-#if defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
-#endif /* defined(__aarch64__) */
-
// Check if the first input tensor is a vector.
// If so, all the kernels for reshaping the tensors can be skipped
if(_run_vector_matrix_multiplication)
@@ -91,7 +86,19 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
}
else
{
-#if defined(__aarch64__)
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+#if defined(__arm__) || defined(__aarch64__)
if(_mm_optimised_kernel != nullptr)
{
struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -100,7 +107,11 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
const int N = d->info()->tensor_shape().x();
const int K = a->info()->tensor_shape().x();
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) || defined(__aarch64__) */
constexpr size_t alignment = 4096;
_workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
@@ -112,7 +123,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
_workspace.allocator()->allocate();
}
else
-#endif /* defined(__aarch64__) */
+#endif /* defined(__arm__) || defined(__aarch64__) */
{
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
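
For context, here is a minimal usage sketch of NEGEMM after this patch. It assumes the public configure(a, b, c, d, alpha, beta) API of this era; the shapes and the main() scaffolding are illustrative. Note from the diff that c == nullptr (or beta == 0.f) is required for the optimised assembly kernel to be selected.

#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const unsigned int M = 64, N = 64, K = 64;

    Tensor a, b, d;
    // TensorShape is (width, height), i.e. (columns, rows)
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));

    NEGEMM gemm;
    // c == nullptr and beta == 0.f satisfy the conditions for the
    // assembly kernel added in this patch (on ARMv7 with F32 data)
    gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    // ... fill a and b ...
    gemm.run();
    return 0;
}
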