aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/NEON/functions/NEConvolutionLayer.cpp
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2018-02-13 12:15:13 +0000
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:47:18 +0000
commit 284cfe2e3a44e5b20978e561c96c94d2193e93a1 (patch)
tree 204cb044578d66c89b3a60d0a3c8c7920c8a768e /src/runtime/NEON/functions/NEConvolutionLayer.cpp
parent f29975848a384fc127cf5401683fc246bab0d903 (diff)
download ComputeLibrary-284cfe2e3a44e5b20978e561c96c94d2193e93a1.tar.gz
COMPMID-903: Implements NEPermute for NHWC conversions
Change-Id: I4083e8d16bb23933634f229a1408dfd0e8f2922a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120069 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/runtime/NEON/functions/NEConvolutionLayer.cpp')
-rw-r--r-- src/runtime/NEON/functions/NEConvolutionLayer.cpp 41
1 files changed, 30 insertions, 11 deletions
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index c6f99782ec..f790f6a95f 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
@@ -255,6 +256,25 @@ void NEConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weigh
}
}
+void NEConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K)
+{ // Sizes _workspace from an interleaved SGEMM built for (M, N, K) and passes it to _memory_group; the body is empty off ARM.
+ ARM_COMPUTE_UNUSED(ci); // ci, M, N and K are only read inside the ARM-only section below
+ ARM_COMPUTE_UNUSED(M); // (presumably these macros silence unused-parameter warnings on other targets - TODO confirm)
+ ARM_COMPUTE_UNUSED(N);
+ ARM_COMPUTE_UNUSED(K);
+#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__)
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+#elif defined(__aarch64__)
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+#endif /* defined(__arm__) #elif defined(__aarch64__) */
+
+ constexpr size_t alignment = 4096; // slack added to each thread's share of the workspace (presumably page-sized headroom - TODO confirm)
+ _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8)); // single-dimension U8 tensor: (working size + alignment - 1) elements per scheduler thread
+ _memory_group.manage(&_workspace); // only init() and manage() happen here; no allocate() call is made in this function
+#endif /* defined(__arm__) || defined(__aarch64__) */
+}
+
void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
// Perform validate step
@@ -384,7 +404,6 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
// Configure matrix multiply
-#if defined(__arm__) || defined(__aarch64__)
if(_mm_optimised_kernel != nullptr)
{
struct CPUInfo ci = NEScheduler::get().cpu_info();
@@ -393,15 +412,16 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
const int N = _gemm_output.info()->tensor_shape().x();
const int K = _input_im2col_reshaped.info()->tensor_shape().x();
-#if defined(__arm__)
- GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
-#elif defined(__aarch64__)
- GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
- constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
+#if defined(__aarch64__)
+ if((N <= 128) && (K <= 128))
+ {
+ _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64NativeKernel>();
+ }
+ else
+#endif /* defined(__aarch64__) */
+ {
+ configure_asm_mm(ci, M, N, K);
+ }
// Configure matrix multiplication kernel
_mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
@@ -409,7 +429,6 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
_workspace.allocator()->allocate();
}
else
-#endif /* defined(__arm__) || defined(__aarch64__) */
{
if(_is_interleaved_transposed)
{