author    Pablo Tello <pablo.tello@arm.com>    2018-02-23 13:43:50 +0000
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:49:16 +0000
commit    eb82fd2aa786715c3b6a941dc6d6deac4ce8e2a0 (patch)
tree      42cca378eed97c07348f28e1ec708d9c7ed531ce
parent    8df6c452820719d201ee79596cde8445c2071db5 (diff)
COMPMID-881: RSH new arm_gemm interface.
Change-Id: I1e2a1a77097d8017c274af3f97eba6964f80f5fa
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122592
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
-rw-r--r--  SConscript | 7
-rw-r--r--  SConstruct | 6
-rw-r--r--  arm_compute/core/NEON/NEKernels.h | 8
-rw-r--r--  arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h | 48
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h | 60
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h | 61
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h | 62
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h | 91
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp (renamed from arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h) | 33
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp | 29
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_common.hpp | 79
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp | 177
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp | 101
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp | 410
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp | 413
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp | 350
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp | 313
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp | 398
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h | 66
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp | 363
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp | 465
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp | 314
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp | 396
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h | 66
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp | 354
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp | 281
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp | 384
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp | 337
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp | 368
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp | 368
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp | 360
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp | 358
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp | 913
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp | 170
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp | 236
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp | 410
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp | 127
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp | 162
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp | 189
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp | 145
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp | 120
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp | 139
-rw-r--r--  arm_compute/runtime/NEON/AssemblyHelper.h | 173
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMM.h | 29
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h | 11
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h | 5
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 6
-rw-r--r--  docs/00_introduction.dox | 12
-rw-r--r--  examples/graph_inception_v4.cpp | 12
-rwxr-xr-x  scripts/check_bad_style.sh | 16
-rwxr-xr-x  scripts/clang_tidy_rules.py | 5
-rw-r--r--  src/core/GLES_COMPUTE/GCKernelLibrary.cpp | 4
-rw-r--r--  src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp | 127
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp | 127
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp | 121
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp | 201
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp | 199
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp | 198
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp | 130
-rw-r--r--  src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp | 132
-rw-r--r--  src/core/NEON/kernels/arm_gemm/asmlib.hpp (renamed from arm_compute/core/NEON/kernels/assembly/asmlib.hpp) | 91
-rw-r--r--  src/core/NEON/kernels/arm_gemm/buffer_manager.hpp | 379
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 65
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 85
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int16.cpp (renamed from arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h) | 40
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 63
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp | 535
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_native.hpp | 102
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp (renamed from arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h) | 40
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 64
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp | 107
-rw-r--r--  src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp | 146
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp) | 38
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp | 400
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp | 398
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp | 346
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp) | 29
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp | 309
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp) | 43
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp | 356
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h | 66
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp | 343
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp) | 38
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp | 456
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp) | 29
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp | 309
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp) | 43
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp | 356
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h | 66
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp | 343
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp) | 35
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp | 273
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp) | 49
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp | 360
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp | 337
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp) | 62
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp | 363
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp | 356
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp | 342
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp | 350
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp | 64
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp | 734
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp | 67
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp | 794
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp (renamed from arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp) | 19
-rw-r--r--  src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp | 913
-rw-r--r--  src/core/NEON/kernels/arm_gemm/mergeresults.hpp (renamed from arm_compute/core/NEON/kernels/assembly/mergeresults.hpp) | 38
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp | 167
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp | 233
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp | 273
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp | 233
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp | 289
-rw-r--r--  src/core/NEON/kernels/arm_gemm/merges/list.hpp (renamed from arm_compute/core/NEON/kernels/assembly/merges/list.hpp) | 9
-rw-r--r--  src/core/NEON/kernels/arm_gemm/misc.cpp | 147
-rw-r--r--  src/core/NEON/kernels/arm_gemm/profiler.hpp (renamed from arm_compute/core/NEON/kernels/assembly/profiler.hpp) | 90
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transform.hpp (renamed from arm_compute/core/NEON/kernels/assembly/transform.hpp) | 68
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp (renamed from arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp) | 114
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp | 116
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp (renamed from arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp) | 83
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp | 170
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp (renamed from arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp) | 68
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp | 192
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp | 135
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp | 113
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp (renamed from arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp) | 113
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/list.hpp (renamed from arm_compute/core/NEON/kernels/assembly/transforms/list.hpp) | 0
-rw-r--r--  src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp | 160
-rw-r--r--  src/core/NEON/kernels/arm_gemm/utils.hpp (renamed from arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h) | 39
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp | 118
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp | 79
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp | 117
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp | 120
-rw-r--r--  tests/validation/NEON/GEMMLowp.cpp | 3
133 files changed, 14092 insertions, 11243 deletions
diff --git a/SConscript b/SConscript
index dab807a47f..cf2dd878e5 100644
--- a/SConscript
+++ b/SConscript
@@ -194,6 +194,8 @@ if env['neon']:
core_files += Glob('src/core/NEON/*.cpp')
core_files += Glob('src/core/NEON/kernels/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp')
+
# build winograd sources for either v7a / v8a
core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp')
core_files += Glob('src/core/NEON/kernels/convolution/winograd/*/*.cpp')
@@ -202,10 +204,11 @@ if env['neon']:
graph2_files += Glob('src/graph2/backends/NEON/*.cpp')
if env['arch'] == "armv7a":
- core_files += Glob('src/core/NEON/kernels/arm32/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a32_*/*.cpp')
+
if "arm64-v8" in env['arch']:
- core_files += Glob('src/core/NEON/kernels/arm64/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp')
runtime_files += Glob('src/runtime/NEON/*.cpp')
runtime_files += Glob('src/runtime/NEON/functions/*.cpp')
diff --git a/SConstruct b/SConstruct
index 7667132bd9..4bf90c107d 100644
--- a/SConstruct
+++ b/SConstruct
@@ -135,19 +135,19 @@ if env['arch'] == 'armv7a':
env.Append(CXXFLAGS = ['-mfloat-abi=softfp'])
elif env['arch'] == 'arm64-v8a':
env.Append(CXXFLAGS = ['-march=armv8-a'])
- env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8A'])
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8A','NO_DOT_IN_TOOLCHAIN'])
if env['os'] == 'linux':
prefix = "aarch64-linux-gnu-"
elif env['os'] == 'bare_metal':
prefix = "aarch64-elf-"
elif env['os'] == 'android':
prefix = "aarch64-linux-android-"
+ env.Append(CXXFLAGS = ['-no-integrated-as'])
elif env['arch'] == 'arm64-v8.2-a':
env.Append(CXXFLAGS = ['-march=armv8.2-a+fp16']) # explicitly enable fp16 extension otherwise __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is undefined
- env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8_2'])
+ env.Append(CPPDEFINES = ['ARM_COMPUTE_AARCH64_V8_2','NO_DOT_IN_TOOLCHAIN'])
if 'clang++' in cpp_compiler:
env.Append(CXXFLAGS = ['-fno-integrated-as'])
-
if env['os'] == 'linux':
prefix = "aarch64-linux-gnu-"
elif env['os'] == 'bare_metal':
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 5c15e5ecc4..7ec74eaccd 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -113,13 +113,5 @@
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
deleted file mode 100644
index 4868f83d74..0000000000
--- a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch32/armv7a NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMAArch32Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMAArch32Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
deleted file mode 100644
index 83c209d48f..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAArch64A53Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpAArch64A53Kernel";
- }
- /** Default constructor */
- NEGEMMLowpAArch64A53Kernel();
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-
-private:
- using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
- const Window &window,
- const ThreadInfo &info);
- NEGEMMLowpAArch64A53 *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
-#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
deleted file mode 100644
index f813242fc9..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAArch64Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpAArch64Kernel";
- }
- /** Default constructor */
- NEGEMMLowpAArch64Kernel();
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-
-private:
- using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1, const Window &window,
- const ThreadInfo &info);
- NEGEMMLowpAArch64 *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
-#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
deleted file mode 100644
index b854d3a9aa..0000000000
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
-
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-
-// Enable only if compiled for AArch64-V8.2-A targets
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-
-namespace arm_compute
-{
-class ITensor;
-
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpAArch64V8P4Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel
- *
- * The computed function is C = a * AxB + b * C.
- *
- * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8
- * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0
- * @param[in] output Output tensor info to store the result of matrix multiplication.
- * If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
-#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h
new file mode 100644
index 0000000000..646cc7861a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+#define __ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Utils.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** This class is a wrapper for the assembly kernels.
+ *
+ * Some kernels are written in assembly and highly optimised for specific CPUs such as the A53 or A55.
+ * This class wraps those assembly kernels: the Compute Library creates an instance
+ * of NEGEMMAssemblyWrapper, plus the auxiliary data structures it needs, to execute a single
+ * assembly kernel in the context of an NEFunction.
+ *
+ * The template parameter T is the type of the actual kernel implemented in assembly, which is of type
+ * template<typename To, typename Tr> class GemmCommon
+ *
+ *
+ */
+template<typename T>
+class NEGEMMAssemblyWrapper final : public INEKernel
+{
+public:
+ /** Constructor
+ */
+ NEGEMMAssemblyWrapper() : _kernel(nullptr) {}
+
+ NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &) = delete;
+ NEGEMMAssemblyWrapper(NEGEMMAssemblyWrapper &&) = default;
+ NEGEMMAssemblyWrapper & operator=(NEGEMMAssemblyWrapper &) = delete;
+
+ const char *name() const override
+ {
+ return "NEGEMMAssemblyWrapper";
+ }
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void*>(_kernel)));
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ auto first = window.x().start();
+ auto last = window.x().end();
+ _kernel->execute(first, last, info.thread_id);
+ }
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] kernel Pointer to an assembly kernel implementation. The pointed-to kernel
+ *                   must remain valid for as long as this wrapper may be run.
+ */
+ void configure(T *kernel)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void*>(kernel)));
+ _kernel = kernel;
+ auto win_last = _kernel->get_window_size();
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+ INEKernel::configure(win);
+ }
+private:
+ T* _kernel;
+};
+
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_H__*/
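
The wrapper above does nothing more than map the scheduler's one-dimensional window onto the arm_gemm work units. A minimal sketch of how it might be driven, not part of this patch (the float GemmCommon type and the NEScheduler call are illustrative assumptions):

#include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h"
#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
#include "arm_compute/runtime/NEON/NEScheduler.h"

// Sketch only: wrap an already-created and already-configured arm_gemm object
// so the NEON scheduler can split its work units across threads.
void schedule_assembly_gemm_sketch(arm_gemm::GemmCommon<float, float> *assembly_gemm)
{
    arm_compute::NEGEMMAssemblyWrapper<arm_gemm::GemmCommon<float, float>> wrapper;
    wrapper.configure(assembly_gemm); // window becomes [0, get_window_size())

    // Splitting along DimX hands contiguous ranges of work units to each thread,
    // which the wrapper forwards to GemmCommon::execute(first, last, thread_id).
    arm_compute::NEScheduler::get().schedule(&wrapper, arm_compute::Window::DimX);
}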
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
index ba78aae9f4..d6c9931a21 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -21,28 +21,19 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__
-#define __ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__
+#pragma once
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include <memory>
-namespace arm_compute
-{
-class ITensor;
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
-/** Native AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMAArch64NativeKernel : public NEGEMMAssemblyBaseKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMAArch64NativeKernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
+namespace arm_gemm {
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMAARCH64NATIVEKERNEL_H__*/
+template<typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
+
+template<typename Top, typename Tret>
+UniqueGemmCommon<Top, Tret> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB, const Tret alpha, const Tret beta, const int maxthreads, const bool pretransposed_hint);
+
+} // namespace arm_gemm
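
The factory declared above is the single entry point the rest of the library needs: given a CPUInfo describing the target core and the GEMM parameters, it returns a std::unique_ptr to a suitable GemmCommon implementation. A minimal usage sketch, not part of this patch (the problem size, thread count and the populated CPUInfo reference are illustrative assumptions):

#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"

// Sketch only: request an FP32 GEMM object for a 128x128x128 problem.
arm_gemm::UniqueGemmCommon<float, float> make_sgemm_sketch(const CPUInfo &ci)
{
    const unsigned int M = 128, N = 128, K = 128;   // assumed problem size
    const bool         trA = false, trB = false;    // operands not transposed
    const float        alpha = 1.0f, beta = 0.0f;
    const int          maxthreads = 4;              // assumed thread budget
    const bool         pretransposed_hint = false;

    // The factory picks a concrete GemmCommon<float, float> for this CPU and problem.
    return arm_gemm::gemm<float, float>(ci, M, N, K, trA, trB, alpha, beta,
                                        maxthreads, pretransposed_hint);
}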
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
new file mode 100644
index 0000000000..a608566634
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* This file is used to configure integration-specific aspects of arm_gemm; this is the gemm-linux version. */
+
+/* Our CPUInfo is defined in newgemm_lib.hpp */
+#include "newgemm_lib.hpp"
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
index ef89e3aac3..7f47abcbb9 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,82 @@
*/
#pragma once
-// Abstract class for a GEMM function
+#include <cstddef>
+
+namespace arm_gemm {
+
+// Abstract class for the GEMM/GEMV functions.
+//
+// GEMM implementations may be "native" (never require any input
+// permutation), "pretransposed" (require permutation up-front) or require
+// working space (permute as they go along). This interface should support
+// all of them.
+
template<typename To, typename Tr>
class GemmCommon {
+protected:
+ const To *_Aptr=nullptr;
+ int _lda=0;
+ const To *_Bptr=nullptr;
+ int _ldb=0;
+ Tr *_Cptr=nullptr;
+ int _ldc=0;
+
public:
- virtual size_t get_working_size() const = 0;
- virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space) const = 0;
+ /* Pass in the pointers to the arrays to be operated on and their
+ * strides. This has a default implementation that just captures them
+ * all in protected members. If B is pretransposed (see below) then the
+ * settings for B here are ignored. */
+ virtual void set_arrays(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc) {
+ _Aptr = A;
+ _lda = lda;
+ _Bptr = B;
+ _ldb = ldb;
+ _Cptr = C;
+ _ldc = ldc;
+ }
+
+ /* For threading, we divide the work into some number of units and work
+ * out internally what unit corresponds to what work. This returns the
+ * total number of units. */
+ virtual unsigned int get_window_size() const = 0;
+
+ /* The maximum thread count is specified when the GEMM is created. Some
+ * implementations need to know how many threads will actually run in
+ * order to work properly.
+ *
+ * In some cases, after creating the GEMM the number of threads needs to
+ * be reduced (e.g. not enough work to split across threads). This
+ * method allows the actual number of threads that will run to be set (it
+ * must be equal to or lower than the maximum).
+ *
+ * This has an empty default implementation, as GEMMs which don't care
+ * about thread count can safely ignore this.
+ */
+ virtual void set_nthreads(int nthreads) { };
+
+ /* Actually do the work. Provide a threadid to index any per-thread
+ * buffers, and a start/end range to indicate which work to do. */
+ virtual void execute(unsigned int start, unsigned int end, int threadid) = 0;
+
+ /*** Working space interface (optional) ***/
+ /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
+ virtual size_t get_working_size() const { return 0; }
+ /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
+ virtual void set_working_space(void *) { };
+
+ /*** "Pretransposed" interface (optional) ***/
+ /* Is this object set up for pretranspose? If so, pretranspose_B_array() needs to be called before execute(). */
+ virtual bool B_is_pretransposed() const { return false; }
+ /* Does pretranspose still need to be done? */
+ virtual bool B_pretranspose_required() const { return false; }
+ /* Total number of bytes of space needed for pretransposed arrays. */
+ virtual size_t get_B_pretransposed_array_size() const { return 0; }
+ /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
+ virtual void pretranspose_B_array(void *buffer, const To *, const int) { };
+
+ // Destructor
virtual ~GemmCommon() { }
};
+
+} // namespace arm_gemm
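
The class above only declares the contract; a concrete implementation may be native, pretransposed, or need working space, and a caller drives all three variants through the same calls. A minimal single-threaded usage sketch, not part of this patch (the helper function and the std::vector buffers are illustrative assumptions):

#include <vector>

// Sketch only: run a GemmCommon<float, float> instance end to end on one thread.
void run_gemm_sketch(arm_gemm::GemmCommon<float, float> &gemm,
                     const float *A, int lda, const float *B, int ldb,
                     float *C, int ldc)
{
    // Hand over operand/result pointers and their leading dimensions.
    gemm.set_arrays(A, lda, B, ldb, C, ldc);

    // Working space is optional: allocate it only if the implementation asks for it.
    std::vector<char> workspace(gemm.get_working_size());
    if(!workspace.empty())
    {
        gemm.set_working_space(workspace.data());
    }

    // Pretranspose B up front if this implementation requires it.
    std::vector<char> pretransposed_b;
    if(gemm.B_pretranspose_required())
    {
        pretransposed_b.resize(gemm.get_B_pretransposed_array_size());
        gemm.pretranspose_B_array(pretransposed_b.data(), B, ldb);
    }

    // Single thread: process the whole window [0, get_window_size()) in one call.
    gemm.set_nthreads(1);
    gemm.execute(0, gemm.get_window_size(), 0 /* threadid */);
}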
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
deleted file mode 100644
index 659ef837f5..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <stdio.h>
-#include <cassert>
-
-#include "gemm_common.hpp"
-#include "profiler.hpp"
-#include "transform.hpp"
-#include "mergeresults.hpp"
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This implementation interleaves the source matrices in blocks - good for
-// larger matrices.
-template<typename strategy, typename To, typename Tr>
-class GemmInterleaved : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
- typedef typename strategy::result_type Tri;
-
- const unsigned int M;
- const unsigned int N;
- const unsigned int K;
-
- const bool trA;
- const bool trB;
-
- const strategy strat;
-
- unsigned int k_block = 0;
- unsigned int x_block = 0;
- unsigned int Mround = 0;
-
- size_t get_a_working_size() const {
- return ROUND_UP(sizeof(Toi) * k_block * Mround);
- }
-
- size_t get_b_working_size() const {
- return ROUND_UP(sizeof(Toi) * x_block * k_block);
- }
-
- size_t get_c_working_size() const {
- return ROUND_UP(sizeof(Tri) * x_block * strat.out_height);
- }
-
-public:
- size_t get_working_size() const override {
- return get_a_working_size() + get_b_working_size() + get_c_working_size();
- }
-
- GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const bool trA, const bool trB) : M(M), N(N), K(K), trA(trA), trB(trB), strat(ci) {
- const unsigned int L1_size = ci->L1_size;
- const unsigned int L2_size = ci->L2_size;
-
- // Work out blocking parameters
- // k_block: Each iteration will consume (out_width + out_height)
- // operands - so how many iterations will fill the L1?
- k_block = L1_size / (sizeof(Toi) * (strat.out_width + strat.out_height));
-
- // Needs to be a multiple of the K unroll level.
- k_block /= strat.k_unroll;
- k_block *= strat.k_unroll;
-
- // Now tune to presented problem size; this is how many blocks we need.
- int num_k_blocks = (K + (k_block - 1)) / k_block;
-
- // So divide the space equally into that many blocks.
- k_block = (K + num_k_blocks - 1) / num_k_blocks;
-
- // And round UP to the K unroll level required.
- k_block = (k_block + strat.k_unroll - 1) / strat.k_unroll;
- k_block *= strat.k_unroll;
-
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- x_block = L2_size / (sizeof(Toi) * k_block);
-
- // Needs to be a multiple of the kernel output width.
- x_block /= strat.out_width;
- x_block *= strat.out_width;
-
- // And tune to the presented problem size.
- int num_x_blocks = (N + (x_block - 1)) / x_block;
- x_block = (N + num_x_blocks - 1) / num_x_blocks;
-
- x_block = (x_block + strat.out_width - 1) / strat.out_width;
- x_block *= strat.out_width;
-
- // Work out the rounded size of M - needed for some buffers.
- Mround = (M + (strat.out_height - 1)) / strat.out_height;
- Mround *= strat.out_height;
-
- }
-
- // Actually execute the GEMM.
- void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
- assert(working_space);
- profiler prof;
- int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
- intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
- size_t diff = 0;
-
- if (working_space_int & 0xF) {
- diff = 0x10 - (working_space_int & 0xF);
- }
-
- Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
- Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
- Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);
-
- for (unsigned int k0=0; k0<K; k0 += k_block) {
- unsigned int kmax = k0 + k_block;
- if (kmax > K) kmax = K;
-
- // Figure out how many "K" the kernel will actually process.
- int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
- kern_k *= strat.k_unroll;
-
- prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) {
- if (trA ^ strategy::A_transpose) {
- Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
- } else {
- Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, A, lda, 0, M, k0, kmax);
- }
- });
-
- for (unsigned int x0=0; x0<N; x0 += x_block) {
- unsigned int xmax = x0 + x_block;
- if (xmax > N) xmax = N;
-
- int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
-
- prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) {
- if (trB ^ strategy::B_transpose) {
- Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
- } else {
- Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, B, ldb, x0, xmax, k0, kmax);
- }
- });
-
- for (unsigned int y=0; y<M; y+=strat.out_height) {
- unsigned int ymax = y + strat.out_height;
- if (ymax > M) ymax = M;
-
- prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
- prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
- }
- }
- }
- }
-};
diff --git a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp b/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
deleted file mode 100644
index 098fdaa7ac..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <stdio.h>
-
-#include "gemm_common.hpp"
-
-#include "profiler.hpp"
-#include "transform.hpp"
-#include "mergeresults.hpp"
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND 64
-#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This is implementation is for GEMV with a transposed matrix.
-//
-// By default the source data is used in-place, but if type conversion is
-// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
-
-template<typename strategy, typename To, typename Tr>
-class GemvTransposed : public GemmCommon<To, Tr> {
- typedef typename strategy::operand_type Toi;
- typedef typename strategy::result_type Tri;
-
- const unsigned int N;
- const unsigned int K;
-
- const strategy strat;
-
- unsigned int m_block;
- unsigned int n_block;
-
- size_t get_a_working_size() const {
- return ROUND_UP(sizeof(Toi) * m_block);
- }
-
- size_t get_b_working_size() const {
- return ROUND_UP(sizeof(Toi) * m_block * n_block);
- }
-
- size_t get_c_working_size() const {
- return ROUND_UP(sizeof(Tri) * n_block);
- }
-
-public:
- size_t get_working_size() const override {
- return get_a_working_size() + get_b_working_size() + get_c_working_size();
- }
-
- GemvTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K) : N(N), K(K), strat(ci) {
- /* For now don't do any blocking. TODO: figure out if we should. */
- m_block = K;
- n_block = N;
- }
-
- // Actually execute the GEMV.
- void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
- profiler prof;
-
- static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
- static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
-
- for (unsigned int m0=0; m0<K; m0+=m_block) {
- unsigned int mmax = m0 + m_block;
- if (mmax > K) mmax = K;
-
- for (unsigned int n0=0; n0<N; n0+=n_block) {
- unsigned int nmax = n0 + n_block;
- if (nmax > N) nmax = N;
-
- prof(PROFILE_KERNEL, ((mmax-m0) * (nmax-n0)), [&](void) { strat.kernel(B + (m0 * ldb) + n0, A + m0, C + n0, alpha, ldb, (mmax-m0), (nmax-n0)); });
- }
- }
- }
-};
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp
deleted file mode 100644
index 6bfbfc8742..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a53.hpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include <arm_neon.h>
-
-#include "../../asmlib.hpp"
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 8x6), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- printf("CIAO SONO IO, AMORE MIO!\n");
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int tails = (K & 3);
- if (tails == 0) {
- tails = 4;
- }
- int k = ((K+3)/4) - 1;
-
- __asm __volatile (
- "vmov.i32 q4, #0\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]\n"
- "vmov.i32 q5, #0\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]\n"
- "vmov.i32 q6, #0\n"
- "ldr r0, [%[a_ptr], #0x10]\n"
- "vmov.i32 q7, #0\n"
- "ldr r1, [%[a_ptr], #0x14]\n"
- "vmov.i32 q8, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x40]")
- "vmov.i32 q9, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x40]")
- "vmov.i32 q10, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x80]")
- "vmov.i32 q11, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x80]")
- "vmov.i32 q12, #0\n"
- "vmov.i32 q13, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0xC0]")
- "vmov.i32 q14, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0XC0]")
- "vmov.i32 q15, #0\n"
- "cmp %[k], #0\n"
- "beq 6f\n"
-
- "1:\n"
- // Unroll 0
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmov d2, r0, r1\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "ldr r0, [%[b_ptr], #0x18]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "ldr r1, [%[b_ptr], #0x1C]\n"
- "vmla.f32 q6, q2, d1[0]\n"
-
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d1[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x100]")
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
-
- "vldr d4, [%[b_ptr], #0x20]\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "ldr r0, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "ldr r1, [%[b_ptr], #0x2C]\n"
- "vmla.f32 q12, q3, d1[0]\n"
-
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "ldr r0, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "ldr r1, [%[a_ptr], #0x2C]\n"
- "vmla.f32 q15, q3, d2[1]\n"
-
- // Unroll 1
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmov d1, r0, r1\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "ldr r0, [%[b_ptr], #0x38]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "ldr r1, [%[b_ptr], #0x3C]\n"
- "vmla.f32 q6, q2, d0[0]\n"
-
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d0[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0x100]")
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
-
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "ldr r0, [%[b_ptr], #0x48]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "ldr r1, [%[b_ptr], #0x4C]\n"
- "vmla.f32 q12, q3, d0[0]\n"
-
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "ldr r0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "ldr r1, [%[a_ptr], #0x44]\n"
- "vmla.f32 q15, q3, d1[1]\n"
-
- // Unroll 2
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmov d0, r0, r1\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "ldr r0, [%[b_ptr], #0x58]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "ldr r1, [%[b_ptr], #0x5C]\n"
- "vmla.f32 q6, q2, d3[0]\n"
-
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x140]")
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
-
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "ldr r0, [%[b_ptr], #0x68]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "ldr r1, [%[b_ptr], #0x6C]\n"
- "vmla.f32 q12, q3, d3[0]\n"
-
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "ldr r0, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "ldr r1, [%[a_ptr], #0x5C]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
-
- // Unroll 3
- "vldr d6, [%[b_ptr], #0x70]\n"
- "vmov d3, r0, r1\n"
- "vmla.f32 q4, q2, d1[0]\n"
- "ldr r0, [%[b_ptr], #0x78]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "ldr r1, [%[b_ptr], #0x7C]\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
-
- "vldr d0, [%[a_ptr], #0x00]\n"
- "vmov d7, r0, r1\n"
- "vmla.f32 q7, q2, d2[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0xC0]")
- "vmla.f32 q8, q2, d3[0]\n"
- "vmla.f32 q9, q2, d3[1]\n"
-
- "vldr d4, [%[b_ptr], #0x00]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "ldr r0, [%[b_ptr], #0x08]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "ldr r1, [%[b_ptr], #0x0C]\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "subs %[k], %[k], #1\n"
-
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmov d5, r0, r1\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "ldr r0, [%[a_ptr], #0x10]\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "ldr r1, [%[a_ptr], #0x14]\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
-
- // "Tails" shows how many multiply blocks are needed at the
- // end, must be 1-4 inclusive. Bail out to alternative tail
- // immediately if it's 1.
- "6:\n"
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
-
- // Detached final iteration - for now adapt the generic
- // tails rather than reimplementing for A53.
-
- // Unroll 0
- "vmov d2, r0, r1\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "add %[b_ptr], %[b_ptr], #0x10\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 4f\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 5f\n"
-
- // Unroll 2
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==1 final tail
- "3:\n"
- "vmov d2, r0, r1\n"
- "add %[b_ptr], %[b_ptr], #0x10\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==2 final tail
- "4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==3 final tail
- "5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vld1.32 {d0}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
-
- "2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"
- );
- }
- }
-}
-
-#endif
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp
deleted file mode 100644
index 4f0ef7cd21..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/a55r1.hpp
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include <arm_neon.h>
-
-#include "../../asmlib.hpp"
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 8x6), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- /* Work out starting values for "k" and "tails" in the inner loop. */
- int tails_initial = (K & 3);
- if (tails_initial == 0) {
- tails_initial = 4;
- }
-
- int k_initial = ((K+3)/4) - 1;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- int tails = tails_initial;
- int k = k_initial;
-
- a_ptr = a_ptr0;
-
- __asm __volatile (
- "vldr d0, [%[a_ptr]]\n"
- "vmov.i32 q4, #0\n"
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmov.i32 q5, #0\n"
- "vldr d4, [%[b_ptr]]\n"
- "vmov.i32 q6, #0\n"
- "vldr d5, [%[b_ptr], #0x08]\n"
- "vmov.i32 q7, #0\n"
- "vldr d2, [%[a_ptr], #0x10]\n"
- "vmov.i32 q8, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x40]")
- "vmov.i32 q9, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x40]")
- "vmov.i32 q10, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x80]")
- "vmov.i32 q11, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x80]")
- "vmov.i32 q12, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0XC0]")
- "vmov.i32 q13, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0xC0]")
- "vmov.i32 q14, #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x100]")
- "vmov.i32 q15, #0\n"
- ASM_PREFETCH("[%[a_ptr], #0x100]")
- "cmp %[k], #0\n"
- ASM_PREFETCH("[%[b_ptr], #0x140]")
- "beq 6f\n"
- ASM_PREFETCH("[%[b_ptr], #0x180]")
-
- "1:\n"
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmla.f32 q7, q2, d1[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x140]")
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[k], %[k], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vldr d4, [%[b_ptr], #0x20]\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vldr d5, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmla.f32 q12, q3, d1[0]\n"
-
- "vmla.f32 q13, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
-
- "vmla.f32 q15, q3, d2[1]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmla.f32 q6, q2, d0[0]\n"
-
- "vmla.f32 q7, q2, d0[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0x1C0]")
- "vmla.f32 q8, q2, d1[0]\n"
-
- "vmla.f32 q9, q2, d1[1]\n"
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vldr d5, [%[b_ptr], #0x48]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmla.f32 q12, q3, d0[0]\n"
-
- "vmla.f32 q13, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
-
- "vmla.f32 q15, q3, d1[1]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
-
- // Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmla.f32 q6, q2, d3[0]\n"
-
- "vmla.f32 q7, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #0x180]")
- "vmla.f32 q8, q2, d0[0]\n"
-
- "vmla.f32 q9, q2, d0[1]\n"
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vldr d5, [%[b_ptr], #0x68]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmla.f32 q12, q3, d3[0]\n"
-
- "vmla.f32 q13, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vldr d6, [%[b_ptr], #0x70]\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vldr d7, [%[b_ptr], #0x78]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vldr d0, [%[a_ptr], #0x00]\n"
- "vmla.f32 q7, q2, d2[1]\n"
- ASM_PREFETCH("[%[b_ptr], #0x180]")
- "vmla.f32 q8, q2, d3[0]\n"
-
- "vmla.f32 q9, q2, d3[1]\n"
- "vldr d4, [%[b_ptr], #0x00]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vldr d5, [%[b_ptr], #0x08]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x08]\n"
- "vmla.f32 q12, q3, d2[0]\n"
-
- "vmla.f32 q13, q3, d2[1]\n"
- "vldr d2, [%[a_ptr], #0x10]\n"
- "vmla.f32 q14, q3, d3[0]\n"
-
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
-
- // "Tails" shows how many multiply blocks are needed at the
- // end, must be 1-4 inclusive. Bail out to alternative tail
- // immediately if it's 1.
- "6:\n"
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
-
- // Detached final iteration
-
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vldr d3, [%[a_ptr], #0x18]\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vldr d4, [%[b_ptr], #0x20]\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vldr d5, [%[b_ptr], #0x28]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x20]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vldr d1, [%[a_ptr], #0x28]\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "beq 4f\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vldr d2, [%[a_ptr], #0x30]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q8, q2, d1[0]\n"
-
- "vmla.f32 q9, q2, d1[1]\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vldr d4, [%[b_ptr], #0x40]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vldr d5, [%[b_ptr], #0x48]\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vldr d3, [%[a_ptr], #0x38]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vldr d0, [%[a_ptr], #0x40]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "beq 5f\n"
-
- // Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vldr d1, [%[a_ptr], #0x48]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vldr d4, [%[b_ptr], #0x60]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vldr d5, [%[b_ptr], #0x68]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vldr d2, [%[a_ptr], #0x50]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vldr d3, [%[a_ptr], #0x58]\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vldr d6, [%[b_ptr], #0x70]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vldr d7, [%[b_ptr], #0x78]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x60\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x80\n"
- "b 2f\n"
-
- // tails==1 final tail
- "3:\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vldr d6, [%[b_ptr], #0x10]\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vldr d7, [%[b_ptr], #0x18]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x18\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x20\n"
- "b 2f\n"
-
- // tails==2 final tail
- "4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vldr d6, [%[b_ptr], #0x30]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vldr d7, [%[b_ptr], #0x38]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x40\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x30\n"
- "b 2f\n"
-
- // tails==3 final tail
- "5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vldr d6, [%[b_ptr], #0x50]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vldr d7, [%[b_ptr], #0x58]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "add %[a_ptr], %[a_ptr], #0x48\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
-
- "2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1"
- );
- }
- }
-}
-
-#endif /* __arm__ */
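For reference, the "k"/"tails" bookkeeping computed at the top of the a32_sgemm_8x6 variants above can be written as a small stand-alone sketch; the helper name split_k and the worked values are illustrative only, assuming the same 4-deep unroll with a 1-4 deep detached tail:

    #include <cassert>

    // Mirror of "tails_initial"/"k_initial" above: K is consumed in 4-deep
    // unrolled steps, with a detached 1-4 deep tail block at the end.
    static void split_k(int K, int &k, int &tails) {
        tails = K & 3;          // leftover depth after groups of four
        if (tails == 0) {
            tails = 4;          // an exact multiple of 4 still runs one detached block
        }
        k = ((K + 3) / 4) - 1;  // iterations of the unrolled main loop
    }

    int main() {
        int k, tails;
        split_k(7, k, tails);
        assert(k == 1 && tails == 3);   // 1*4 + 3 = 7
        split_k(8, k, tails);
        assert(k == 1 && tails == 4);   // 1*4 + 4 = 8
        return 0;
    }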
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp
deleted file mode 100644
index 7a44fed5b2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6/generic.hpp
+++ /dev/null
@@ -1,350 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "../../asmlib.hpp"
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 8x6), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int tails = (K & 3);
- if (tails == 0) {
- tails = 4;
- }
- int k = ((K+3)/4) - 1;
-
- __asm __volatile (
- "vmov.i32 q4, #0\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmov.i32 q5, #0\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
- "vmov.i32 q6, #0\n"
- ASM_PREFETCH("[%[a_ptr], #48]")
- "vmov.i32 q7, #0\n"
- ASM_PREFETCH("[%[b_ptr], #48]")
- "vmov.i32 q8, #0\n"
- ASM_PREFETCH("[%[a_ptr], #112]")
- "vmov.i32 q9, #0\n"
- ASM_PREFETCH("[%[b_ptr], #112]")
- "vmov.i32 q10, #0\n"
- "vmov.i32 q11, #0\n"
- "vmov.i32 q12, #0\n"
- "vmov.i32 q13, #0\n"
- ASM_PREFETCH("[%[a_ptr], #176]")
- "vmov.i32 q14, #0\n"
- ASM_PREFETCH("[%[b_ptr], #176]")
- "vmov.i32 q15, #0\n"
-
- "cmp %[k], #0\n"
- "beq 6f\n"
-
- "1:\n"
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "subs %[k], %[k], #1\n"
- "vmla.f32 q5, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #208]")
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
-
- // Unroll 2
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- ASM_PREFETCH("[%[a_ptr], #240]")
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- ASM_PREFETCH("[%[b_ptr], #208]")
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d1[0]\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "bne 1b\n"
-
- // Branch here if we never execute main loop.
- "6:\n"
-
- // "Tails" shows how many multiply blocks are needed at the
- // end, must be 1-4 inclusive. Bail out to alternative tail
- // immediately if it's 1.
- "subs %[tails], %[tails], #1\n"
- "beq 3f\n"
-
- // Detached final iteration
- // Unroll 0
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d0[0]\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 4f\n"
-
- // Unroll 1
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "subs %[tails], %[tails], #1\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d3[0]\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "beq 5f\n"
-
- // Unroll 2
- "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
-
- "vmla.f32 q10, q3, d2[0]\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
-
- // Unroll 3
- "vmla.f32 q4, q2, d1[0]\n"
- "vmla.f32 q10, q3, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d1[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d1[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d2[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d2[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d2[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d2[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d3[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d3[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d3[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d3[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==1 final tail
- "3:\n"
- "vmla.f32 q4, q2, d0[0]\n"
- "vld1.32 {d2}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d0[1]\n"
- "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
- "vmla.f32 q6, q2, d1[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d0[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d0[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d1[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d1[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d1[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d2[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d2[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d2[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d2[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==2 final tail
- "4:\n"
- "vmla.f32 q4, q2, d3[0]\n"
- "vmla.f32 q10, q3, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q5, q2, d3[1]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d3[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q6, q2, d0[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d0[0]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d0[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d0[1]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d1[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d1[0]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d1[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d1[1]\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
- "b 2f\n"
-
- // tails==3 final tail
- "5:\n"
- "vmla.f32 q4, q2, d2[0]\n"
- "vld1.32 {d0}, [%[a_ptr] :64]!\n"
- "vmla.f32 q5, q2, d2[1]\n"
- "vmla.f32 q6, q2, d3[0]\n"
- "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
- "vmla.f32 q10, q3, d2[0]\n"
- "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
- "vmla.f32 q11, q3, d2[1]\n"
- "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
- "vmla.f32 q12, q3, d3[0]\n"
- "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
- "vmla.f32 q7, q2, d3[1]\n"
- "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
- "vmla.f32 q13, q3, d3[1]\n"
- "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
- "vmla.f32 q8, q2, d0[0]\n"
- "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
- "vmla.f32 q14, q3, d0[0]\n"
- "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
- "vmla.f32 q9, q2, d0[1]\n"
- "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
- "vmla.f32 q15, q3, d0[1]\n"
- "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
- "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
-
- "2:\n"
- "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), [tails] "+r" (tails)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc"
- );
- }
- }
-}
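A scalar model of what each output block of a32_sgemm_8x6 accumulates may help when reading the vmla/vst1 sequences above. This is a hedged sketch, not a drop-in replacement: it assumes the packed layout described in the header comment (6 A values and 8 B values consumed per K step) and writes C as 6 rows of 8 floats, matching the q4/q10, q5/q11, ... store order:

    // Plain C++ model of one C block: acc[r][c] += A[r] * B[c] for each k step.
    static void sgemm_8x6_block_ref(const float *Apanel, const float *Bpanel,
                                    float *Cblock, int K) {
        float acc[6][8] = {};
        for (int k = 0; k < K; k++) {
            const float *a = Apanel + k * 6;   // 6 packed A values per step
            const float *b = Bpanel + k * 8;   // 8 packed B values per step
            for (int r = 0; r < 6; r++) {
                for (int c = 0; c < 8; c++) {
                    acc[r][c] += a[r] * b[c];
                }
            }
        }
        for (int r = 0; r < 6; r++) {          // row r is stored as q(4+r) then q(10+r)
            for (int c = 0; c < 8; c++) {
                *Cblock++ = acc[r][c];
            }
        }
    }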
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
deleted file mode 100644
index 10259b2fdf..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include <arm_neon.h>
-
-inline void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
-{
- const int16_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
- for (int yb = 0; yb < ablocks; yb++)
- {
- const int16_t *a_ptr0 = a_ptr;
- const int16_t *b_ptr = Bpanel;
-
- for (int xb = 0; xb < bblocks; xb++)
- {
- a_ptr = a_ptr0;
- const bool odd_k = K & 0x1;
- int k = (K+1)/2 - 1;
-
- register int16x8_t aa asm("v0");
- register int16x8_t ab asm("v1");
- register int16x8_t b0 asm("v2");
- register int16x8_t b1 asm("v3");
- register int16x8_t b2 asm("v4");
-
- __asm __volatile (
- "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
- "movi v5.4s, #0\n"
- "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
- "movi v6.4s, #0\n"
- "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
- "movi v7.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v8.4s, #0\n"
- "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
- "movi v9.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v10.4s, #0\n"
- "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
- "movi v11.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #96]")
- "movi v12.4s, #0\n"
- "movi v13.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #96]")
- "movi v14.4s, #0\n"
- "movi v15.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #160]")
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #160]")
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v24.4s, #0\n"
- "add %x[a_ptr], %x[a_ptr], #0x10\n"
- "movi v25.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v26.4s, #0\n"
- "add %x[b_ptr], %x[b_ptr], #0x18\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
-
- "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
-
- "1:\n" // Main loop
- // First unroll
- "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
- "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- // Second unroll
- "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
- "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
- "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "add %x[a_ptr], %x[a_ptr], #0x20\n"
- "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "subs %x[k], %x[k], #0x1\n"
- "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
- "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "add %x[b_ptr], %x[b_ptr], #0x30\n"
- "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "bne 1b\n"
-
- "2:\n" // Even tail
- "cbnz %x[odd_k], 3f\n"
-
- "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "add %[a_ptr], %[a_ptr], #0x10\n"
- "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "add %[b_ptr], %[b_ptr], #0x18\n"
- "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
- "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "str q19, [%x[c_ptr], #0x130]\n"
- "b 4f\n" // Complete write out
-
- "3:\n" // Odd tail
- "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
-
- "4:\n" // End of function
- "str q19, [%x[c_ptr], #0x130]\n"
- "str q27, [%x[c_ptr], #0x140]\n"
- "str q12, [%x[c_ptr], #0x150]\n"
- "str q20, [%x[c_ptr], #0x160]\n"
- "str q28, [%x[c_ptr], #0x170]\n"
- "add %x[c_ptr], %x[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
- [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
- : [odd_k] "r" (odd_k)
- : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
- );
- }
- }
-}
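The core of the s16 kernel above is the smlal/smlal2 pair: one 8-wide int16 B vector times a single A element, widened and accumulated into two int32x4 registers. A hedged intrinsics rendering of that pattern follows (the assembly multiplies by a register lane rather than a plain scalar, but the arithmetic is the same):

    #include <arm_neon.h>

    // AArch64 only: vmlal_high_n_s16 has no AArch32 counterpart.
    static inline void widen_mla(int32x4_t &acc_lo, int32x4_t &acc_hi,
                                 int16x8_t b, int16_t a) {
        acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(b), a);  // ~ "smlal  vX.4s, b.4h, a"
        acc_hi = vmlal_high_n_s16(acc_hi, b, a);           // ~ "smlal2 vY.4s, b.8h, a"
    }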
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
deleted file mode 100644
index 4ac2ba4234..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
- // We divide K by 4 because the sdot instruction processes 4 elements at a time.
- const int W = K/4;
- // Fix up for odd lengths - set a flag if W is odd, but make
- // sure we round up the iteration count.
- const int oddk = (W & 1);
- const int init_value_k = ((W+1)/2) - 1;
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int k = init_value_k;
- register int32x4_t a0 asm("v0");
- register int32x4_t a1 asm("v1");
- register int32x4_t b0 asm("v2");
- register int32x4_t b1 asm("v3");
- register int32x4_t b2 asm("v4");
- register int32x4_t a0a asm("v5");
- register int32x4_t a1a asm("v6");
- __asm __volatile (
- _DECLARE_SDOT
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
-
- // Loop proper
- "1:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
-
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "ins %[a0].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "ins %[a1].d[1], x20\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "subs %w[k], %w[k], #1\n"
- "ins %[b1].d[1], x20\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "bne 1b\n"
-
- // Target to use when W is 1 or 2 (i.e. zero iterations of the main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "ins %[b2].d[1], x20\n"
-
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "str q8, [%[c_ptr], #0]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "ldr x20, [%[b_ptr], #40]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "ins %[b2].d[1], x20\n"
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
-
-
- ".purgem sdot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
-
-
- }
- }
-}
-
-#endif
-
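For reference, the loop-count derivation shared by the sdot kernels (W, oddk and the initial k above) as a stand-alone sketch; split_k_sdot and the worked values are illustrative only:

    // Each sdot lane-product consumes 4 int8 values and the main loop is
    // unrolled twice, so K is first divided by 4 and then split into pairs
    // plus a parity flag that selects the detached odd/even tail.
    static void split_k_sdot(int K, int &k, int &oddk) {
        const int W = K / 4;        // number of 4-element sdot steps
        oddk = W & 1;               // one unpaired step left over?
        k = ((W + 1) / 2) - 1;      // double-unrolled main-loop iterations
    }
    // e.g. K = 24 -> W = 6, oddk = 0, k = 2 ; K = 20 -> W = 5, oddk = 1, k = 2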
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
deleted file mode 100644
index 1d6fd1623e..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// Define a macro to assemble the SDOT instruction (in the absence of toolchain support)
-#define _DECLARE_SDOT ".altmacro\n"\
- ".macro sdot opd:req, opn:req, opm:req\n"\
- "local vd, vn, vm, h, l\n"\
- ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
- ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
- ".set vd,\\reg\n"\
- ".endif\n"\
- ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
- ".set vn,\\reg\n"\
- ".endif\n"\
- ".irp idx,0,1,2,3\n"\
- ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
- ".set vm,\\reg\n"\
- ".set h,\\idx / 2\n"\
- ".set l,\\idx %% 2\n"\
- ".endif\n"\
- ".endr\n"\
- ".endr\n"\
- ".ifndef vd\n"\
- ".error \"Bad operand \\opd\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vn\n"\
- ".error \"Bad operand \\opn\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vm\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef h\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef l\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
- ".endm\n"\
-
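The macro's final ".int" line hand-encodes SDOT (by element) from the parsed register numbers and lane index. The same bit packing, written as a C++ helper purely to make the field positions easier to read:

    #include <cstdint>

    // Same layout as the assembler macro above: the base word used for sdot
    // is OR'd with Rd, Rn << 5, Rm << 16 and the two lane-index bits L and H.
    static inline uint32_t encode_sdot_lane(unsigned vd, unsigned vn,
                                            unsigned vm, unsigned lane) {
        const unsigned h = lane / 2;
        const unsigned l = lane % 2;
        return 0x4f80e000u | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11);
    }
    // encode_sdot_lane(8, 2, 0, 0) matches what the macro emits for
    // "sdot v8.4s, v2.16b, v0.4b[0]".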
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
deleted file mode 100644
index bfad0373b2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-
-inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- K/=4;
- const long int row_jump=0;
- const long int block_jump=0;
- const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel);
- int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel);
- for (int yb=0; yb<ablocks; yb++) {
- const int32_t *a_ptr0 = a_ptr;
- const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel);
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
- register int32x4_t a0 asm("v0");
- register int32x4_t a1 asm("v1");
- register int32x4_t b0 asm("v2");
- register int32x4_t b1 asm("v3");
- register int32x4_t b2 asm("v4");
- register int32x4_t a0a asm("v5");
- register int32x4_t a1a asm("v6");
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- _DECLARE_SDOT
-
- // Loop proper
- "1:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
-
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "subs %w[k], %w[k], #1\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
-
- "add %[b_ptr], %[b_ptr], %[block_jump]\n"
- "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "str q8, [%[c_ptr], #0]\n"
- "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
- ".purgem sdot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-
-
-}
-
-
-#endif
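The "K/=4; oddk = K & 1; k = ((K+1)/2) - 1" bookkeeping at the top of this kernel is shared by the other 12x8 variants: K is first divided by 4 (one dot-product step consumes 4 bytes per lane), the main loop then handles two such steps per iteration, and one or two steps are always peeled off into the detached tails at labels 4:/2:. A small sketch of that arithmetic, assuming K is already a multiple of 4 as the packed panels provide (illustrative only):

#include <cstdio>

// Reproduces the loop-count fix-up used by a64_gemm_s8_12x8.
int main() {
    for (int K = 4; K <= 24; K += 4) {
        int steps = K / 4;                   // 4-byte dot-product steps
        int oddk  = steps & 1;               // odd step count -> odd tail at "2:"
        int k     = ((steps + 1) / 2) - 1;   // main-loop iterations (2 steps each)
        // The main loop covers 2*k steps; the detached tail covers the rest.
        std::printf("K=%2d: k=%d oddk=%d (loop steps=%d, tail steps=%d)\n",
                    K, k, oddk, 2 * k, steps - 2 * k);
    }
    return 0;
}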
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp
deleted file mode 100644
index 0ec435b33b..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp
+++ /dev/null
@@ -1,465 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-inline void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
- const int8_t *a_ptr = Apanel;
- int32_t *c_ptr = Cpanel;
- K /= 16;
- int oddk = (K & 1);
-
- for (int yb=0; yb<ablocks; yb++) {
- const int8_t *a_ptr0 = a_ptr;
- const int8_t *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
-
- int k = ((K+1)/2)-1;
-
- register int8x16_t b0 asm("v4");
- register int8x16_t b1 asm("v5");
- register int8x16_t b2 asm("v6");
- register int8x16_t b3 asm("v7");
- register int8x16_t b0a asm("v8");
- register int8x16_t b1a asm("v9");
- register int8x16_t b2a asm("v10");
- register int8x16_t b3a asm("v11");
-
- __asm __volatile (
- "movi v16.4s, #0x0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v17.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v18.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v19.4s, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v20.4s, #0x0\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%[a_ptr], #16]\n"
- "movi v22.4s, #0x0\n"
- "ldr q2, [%[a_ptr], #32]\n"
- "movi v23.4s, #0x0\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "movi v24.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v25.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v26.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v27.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v28.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v29.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v30.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v31.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
-
- // Loop structure optimized for A57 (after r0).
-
- // Unavoidably, the multiply will "dribble" if
- // dual issued with an add.
-
- // Minimize the effect of this by making sure
- // there are 2 adds to run under the dribbled
- // multiply.
-
- // Pipeline in blocks of 8 multiplies - combine
- // this iteration's multiplies with adds from
- // the previous iteration.
-
- // So the first block doesn't have any adds to
- // do - but because all the adds are at the
- // start of the block it's only the first couple
- // of multiplies that need to be pulled out.
-
- // Start of unroll 0 (first iteration)
- "smull v12.8h, v0.8b, %[b0].8b\n"
- "smull v13.8h, v0.8b, %[b1].8b\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- // Unroll 0 continuation (branch target)
- "1:\n"
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "subs %w[k], %w[k], #1\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "ldr %q[b0a], [%[b_ptr], #64]\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "ldr %q[b1a], [%[b_ptr], #80]\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr], #64]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "ldr %q[b2a], [%[b_ptr], #96]\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "ldr %q[b3a], [%[b_ptr], #112]\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "add %[b_ptr], %[b_ptr], #128\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #80]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #96]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "ldr %q[b0], [%[b_ptr], #0]\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr q3, [%[a_ptr], #112]\n"
-
- // Unroll 1
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0a].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1a].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "smull v14.8h, v0.8b, %[b2a].8b\n"
- "smull v15.8h, v0.8b, %[b3a].8b\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
- "ldr q0, [%[a_ptr], #128]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0a].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1a].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "smull v14.8h, v1.8b, %[b2a].8b\n"
- "smull v15.8h, v1.8b, %[b3a].8b\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
- "ldr q1, [%[a_ptr], #16]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0a].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1a].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2a].8b\n"
- "smull v15.8h, v2.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
- "ldr q2, [%[a_ptr], #32]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0a].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1a].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2a].8b\n"
- "smull v15.8h, v3.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
- "ldr q3, [%[a_ptr], #48]\n"
-
- // Start of unroll 0 for next iteration.
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "bne 1b\n"
-
- // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "ldr %q[b0a], [%[b_ptr], #64]\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "ldr %q[b1a], [%[b_ptr], #80]\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr], #64]\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "ldr %q[b2a], [%[b_ptr], #96]\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "ldr %q[b3a], [%[b_ptr], #112]\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "add %[b_ptr], %[b_ptr], #128\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #80]\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #96]\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr q3, [%[a_ptr], #112]\n"
-
- // Unroll 1
- "sadalp v28.4s, v12.8h\n"
- "smull v12.8h, v0.8b, %[b0a].8b\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "smull v13.8h, v0.8b, %[b1a].8b\n"
- "sadalp v31.4s, v15.8h\n"
- "smull v14.8h, v0.8b, %[b2a].8b\n"
- "add %[a_ptr], %[a_ptr], #128\n"
- "smull v15.8h, v0.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0a].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1a].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2a].8b\n"
- "smull v15.8h, v1.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0a].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1a].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smull v14.8h, v2.8b, %[b2a].8b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "smull v15.8h, v2.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
- "str q16, [%[c_ptr]]\n"
- "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0a].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1a].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smull v14.8h, v3.8b, %[b2a].8b\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "smull v15.8h, v3.8b, %[b3a].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
- "str q17, [%[c_ptr], #16]\n"
- "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "smull v14.8h, v0.8b, %[b2].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "smull v15.8h, v0.8b, %[b3].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "smlal2 v12.8h, v0.16b, %[b0].16b\n"
- "smlal2 v13.8h, v0.16b, %[b1].16b\n"
- "smlal2 v14.8h, v0.16b, %[b2].16b\n"
- "smlal2 v15.8h, v0.16b, %[b3].16b\n"
-
- "sadalp v16.4s, v12.8h\n"
- "smull v12.8h, v1.8b, %[b0].8b\n"
- "sadalp v17.4s, v13.8h\n"
- "sadalp v18.4s, v14.8h\n"
- "smull v13.8h, v1.8b, %[b1].8b\n"
- "sadalp v19.4s, v15.8h\n"
- "smull v14.8h, v1.8b, %[b2].8b\n"
- "smull v15.8h, v1.8b, %[b3].8b\n"
- "smlal2 v12.8h, v1.16b, %[b0].16b\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smlal2 v13.8h, v1.16b, %[b1].16b\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smlal2 v14.8h, v1.16b, %[b2].16b\n"
- "smlal2 v15.8h, v1.16b, %[b3].16b\n"
-
- "sadalp v20.4s, v12.8h\n"
- "smull v12.8h, v2.8b, %[b0].8b\n"
- "sadalp v21.4s, v13.8h\n"
- "sadalp v22.4s, v14.8h\n"
- "smull v13.8h, v2.8b, %[b1].8b\n"
- "sadalp v23.4s, v15.8h\n"
- "addp v16.4s, v16.4s, v17.4s\n"
- "smull v14.8h, v2.8b, %[b2].8b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "smull v15.8h, v2.8b, %[b3].8b\n"
- "smlal2 v12.8h, v2.16b, %[b0].16b\n"
- "str q16, [%[c_ptr]]\n"
- "smlal2 v13.8h, v2.16b, %[b1].16b\n"
- "smlal2 v14.8h, v2.16b, %[b2].16b\n"
- "smlal2 v15.8h, v2.16b, %[b3].16b\n"
-
- "sadalp v24.4s, v12.8h\n"
- "smull v12.8h, v3.8b, %[b0].8b\n"
- "sadalp v25.4s, v13.8h\n"
- "sadalp v26.4s, v14.8h\n"
- "smull v13.8h, v3.8b, %[b1].8b\n"
- "sadalp v27.4s, v15.8h\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "smull v14.8h, v3.8b, %[b2].8b\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "smull v15.8h, v3.8b, %[b3].8b\n"
- "smlal2 v12.8h, v3.16b, %[b0].16b\n"
- "str q17, [%[c_ptr], #16]\n"
- "smlal2 v13.8h, v3.16b, %[b1].16b\n"
- "smlal2 v14.8h, v3.16b, %[b2].16b\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "smlal2 v15.8h, v3.16b, %[b3].16b\n"
-
- "3:\n"
-
- // Final additions
- "sadalp v28.4s, v12.8h\n"
- "str q18, [%[c_ptr], #32]\n"
- "sadalp v29.4s, v13.8h\n"
- "sadalp v30.4s, v14.8h\n"
- "sadalp v31.4s, v15.8h\n"
-
- // Horizontal reduction, phase 1
- "addp v22.4s, v28.4s, v29.4s\n"
- "addp v23.4s, v30.4s, v31.4s\n"
-
- // Horizontal reduction, phase 2
- "addp v19.4s, v22.4s, v23.4s\n"
- "str q19, [%[c_ptr], #48]\n"
- "add %[c_ptr], %[c_ptr], #64\n"
-
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
- [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a),
- [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
- "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
- }
- }
-}
-
-#endif // __aarch64__
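In the 4x4 kernel the accumulation chain is smull/smlal2 into 16-bit lanes, sadalp to fold those into 32-bit lanes, then addp pairs to reduce to the final 4x4 block. A rough intrinsics rendering of one (row, column) accumulator, a sketch only (the real kernel keeps everything in the pipelined asm above; the helper name dot16_s8 is ours):

#ifdef __aarch64__
#include <arm_neon.h>
#include <cstdint>

// One dot product over a 16-byte slice, using the same
// smull -> smlal2 -> sadalp -> horizontal-add shape as the asm kernel.
static int32_t dot16_s8(const int8_t *a, const int8_t *b) {
    int8x16_t va = vld1q_s8(a);
    int8x16_t vb = vld1q_s8(b);
    int16x8_t prod = vmull_s8(vget_low_s8(va), vget_low_s8(vb)); // smull
    prod = vmlal_high_s8(prod, va, vb);                          // smlal2
    int32x4_t acc = vpadalq_s16(vdupq_n_s32(0), prod);           // sadalp
    return vaddvq_s32(acc);                                      // addp pairs in the asm
}
#endif // __aarch64__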
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp
deleted file mode 100644
index b3f310ce62..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include <arm_neon.h>
-
-inline void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
-{
- const uint16_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
-
- for (int yb = 0; yb < ablocks; yb++)
- {
- const uint16_t *a_ptr0 = a_ptr;
- const uint16_t *b_ptr = Bpanel;
-
- for (int xb = 0; xb < bblocks; xb++)
- {
- a_ptr = a_ptr0;
- const bool odd_k = K & 0x1;
- int k = (K+1)/2 - 1;
-
- register uint16x8_t aa asm("v0");
- register uint16x8_t ab asm("v1");
- register uint16x8_t b0 asm("v2");
- register uint16x8_t b1 asm("v3");
- register uint16x8_t b2 asm("v4");
-
- __asm __volatile (
- "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
- "movi v5.4s, #0\n"
- "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
- "movi v6.4s, #0\n"
- "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
- "movi v7.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v8.4s, #0\n"
- "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
- "movi v9.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v10.4s, #0\n"
- "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
- "movi v11.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #96]")
- "movi v12.4s, #0\n"
- "movi v13.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #96]")
- "movi v14.4s, #0\n"
- "movi v15.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #160]")
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #160]")
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v24.4s, #0\n"
- "add %x[a_ptr], %x[a_ptr], #0x10\n"
- "movi v25.4s, #0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v26.4s, #0\n"
- "add %x[b_ptr], %x[b_ptr], #0x18\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
-
- "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
-
- "1:\n" // Main loop
- // First unroll
- "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
- "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- // Second unroll
- "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
- "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
- "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
- "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "add %x[a_ptr], %x[a_ptr], #0x20\n"
- "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "subs %x[k], %x[k], #0x1\n"
- "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
- "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
- "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "add %x[b_ptr], %x[b_ptr], #0x30\n"
- "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "bne 1b\n"
-
- "2:\n" // Even tail
- "cbnz %x[odd_k], 3f\n"
-
- "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
- "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
- "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
- "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
- "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
- "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
- "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
- "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "add %[a_ptr], %[a_ptr], #0x10\n"
- "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "add %[b_ptr], %[b_ptr], #0x18\n"
- "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
- "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
-
- "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
- "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
- "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
- "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
- "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
- "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
- "str q19, [%x[c_ptr], #0x130]\n"
- "b 4f\n" // Complete write out
-
- "3:\n" // Odd tail
- "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
- "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
- "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
- "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
- "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
- "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
- "str q5, [%x[c_ptr]]\n"
- "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
- "str q13, [%x[c_ptr], #0x10]\n"
- "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
- "str q21, [%x[c_ptr], #0x20]\n"
- "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
- "str q6, [%x[c_ptr], #0x30]\n"
- "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
- "str q14, [%x[c_ptr], #0x40]\n"
- "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
- "str q22, [%x[c_ptr], #0x50]\n"
- "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
- "str q7, [%x[c_ptr], #0x60]\n"
- "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
- "str q15, [%x[c_ptr], #0x70]\n"
- "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
- "str q23, [%x[c_ptr], #0x80]\n"
- "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
- "str q8, [%x[c_ptr], #0x90]\n"
- "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
- "str q16, [%x[c_ptr], #0xa0]\n"
- "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
- "str q24, [%x[c_ptr], #0xb0]\n"
- "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
- "str q9, [%x[c_ptr], #0xc0]\n"
- "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
- "str q17, [%x[c_ptr], #0xd0]\n"
- "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
- "str q25, [%x[c_ptr], #0xe0]\n"
- "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
- "str q10, [%x[c_ptr], #0xf0]\n"
- "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
- "str q18, [%x[c_ptr], #0x100]\n"
- "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
- "str q26, [%x[c_ptr], #0x110]\n"
- "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
- "str q11, [%x[c_ptr], #0x120]\n"
-
- "4:\n" // End of function
- "str q19, [%x[c_ptr], #0x130]\n"
- "str q27, [%x[c_ptr], #0x140]\n"
- "str q12, [%x[c_ptr], #0x150]\n"
- "str q20, [%x[c_ptr], #0x160]\n"
- "str q28, [%x[c_ptr], #0x170]\n"
- "add %x[c_ptr], %x[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k),
- [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2)
- : [odd_k] "r" (odd_k)
- : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc"
- );
- }
- }
-}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp
deleted file mode 100644
index c7c2acbb49..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-inline void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
- // We divide K by 4 because the udot instruction processes 4 elements at a time.
- const int W = K/4;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- const int oddk = (W & 1);
- const int init_value_k = ((W+1)/2) - 1;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int k = init_value_k;
- register int32x4_t a0 asm("v0");
- register int32x4_t a1 asm("v1");
- register int32x4_t b0 asm("v2");
- register int32x4_t b1 asm("v3");
- register int32x4_t b2 asm("v4");
- register int32x4_t a0a asm("v5");
- register int32x4_t a1a asm("v6");
- __asm __volatile (
- _DECLARE_UDOT
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
-
- // Loop proper
- "1:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
-
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
-
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ins %[b2].d[1], x20\n"
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "ins %[a0].d[1], x20\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "ins %[a1].d[1], x20\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
-
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "subs %w[k], %w[k], #1\n"
- "ins %[b1].d[1], x20\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- // Detached final iteration (even K)
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "ins %[b2].d[1], x20\n"
-
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "ins %[a0a].d[1], x20\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "ins %[a1a].d[1], x20\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "ins %[b0].d[1], x20\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "ins %[b1].d[1], x20\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "ins %[b2].d[1], x20\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "str q8, [%[c_ptr], #0]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
-
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "ldr x20, [%[b_ptr], #40]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "ins %[b2].d[1], x20\n"
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
-
-
- ".purgem udot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
-#endif
-
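The A55r1 variant above (like the u16 kernel) splits each 128-bit load into a 64-bit NEON load plus a 64-bit integer load that is merged back with ins, presumably to spread load work across issue pipes on the in-order core. The merge itself, expressed with intrinsics purely for illustration (the scheduling benefit only exists in the hand-written asm; the helper name load_q_split is ours):

#ifdef __aarch64__
#include <arm_neon.h>
#include <cstdint>
#include <cstring>

// Sketch of the "ldr %d[...] / ldr x20 / ins ...d[1], x20" pattern:
// load the two 64-bit halves separately and stitch them into one q register.
static uint32x4_t load_q_split(const uint8_t *p) {
    uint64_t lo, hi;
    std::memcpy(&lo, p, 8);       // "ldr %d[b2], [%[b_ptr], #32]"  (lower half)
    std::memcpy(&hi, p + 8, 8);   // "ldr x20, [%[b_ptr], #40]"     (upper half)
    uint64x2_t v = vdupq_n_u64(lo);
    v = vsetq_lane_u64(hi, v, 1); // "ins %[b2].d[1], x20"
    return vreinterpretq_u32_u64(v);
}
#endif // __aarch64__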
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
deleted file mode 100644
index 718232fb05..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-// Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
-#define _DECLARE_UDOT ".altmacro\n"\
- ".macro udot opd:req, opn:req, opm:req\n"\
- "local vd, vn, vm, h, l\n"\
- ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
- ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
- ".set vd,\\reg\n"\
- ".endif\n"\
- ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
- ".set vn,\\reg\n"\
- ".endif\n"\
- ".irp idx,0,1,2,3\n"\
- ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
- ".set vm,\\reg\n"\
- ".set h,\\idx / 2\n"\
- ".set l,\\idx %% 2\n"\
- ".endif\n"\
- ".endr\n"\
- ".endr\n"\
- ".ifndef vd\n"\
- ".error \"Bad operand \\opd\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vn\n"\
- ".error \"Bad operand \\opn\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef vm\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef h\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".ifndef l\n"\
- ".error \"Bad operand \\opm\"\n"\
- ".exitm\n"\
- ".endif\n"\
- ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
- ".endm\n"\
-
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp
deleted file mode 100644
index 3531eb6d25..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "dot_toolchain_support.h"
-#include <cassert>
-
-inline void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- assert(Apanel);
- assert(Bpanel);
- assert(Cpanel);
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
- // We divide K by 4 because the udot instruction processes 4 elements at a time.
- const int W = K/4;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- const int oddk = (W & 1);
- const int init_value_k = ((W+1)/2) - 1;
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- int k = init_value_k;
- register uint32x4_t a0 asm("v0");
- register uint32x4_t a1 asm("v1");
- register uint32x4_t b0 asm("v2");
- register uint32x4_t b1 asm("v3");
- register uint32x4_t b2 asm("v4");
- register uint32x4_t a0a asm("v5");
- register uint32x4_t a1a asm("v6");
- __asm __volatile (
- _DECLARE_UDOT
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- // Loop proper
- "1:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
-
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "subs %w[k], %w[k], #1\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
-
- "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
- "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
-
- ".purgem udot\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
-
- }
- }
-
-
-}
-#endif
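For reference, the arithmetic performed by the deleted a64_gemm_u8_12x8 kernel on each output tile is an 8-row by 12-column block of unsigned 8-bit dot products accumulated into 32 bits, which is exactly what one UDOT lane computes four bytes at a time. A scalar sketch of that math only (hypothetical helper; the real kernel reads interleaved "read order" panels, which this deliberately ignores):

#include <cstdint>
#include <cstddef>

// Plain scalar reference for one 8x12 tile: A is 8 x K, B is 12 x K with one
// row per output column, C is 8 x 12, all row-major for simplicity.
static void gemm_u8_12x8_reference(const uint8_t *A, const uint8_t *B,
                                   uint32_t *C, size_t K) {
    for (size_t row = 0; row < 8; row++) {
        for (size_t col = 0; col < 12; col++) {
            uint32_t acc = 0;
            for (size_t k = 0; k < K; k++) {
                acc += static_cast<uint32_t>(A[row * K + k]) *
                       static_cast<uint32_t>(B[col * K + k]);
            }
            C[row * 12 + col] = acc;
        }
    }
}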
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp
deleted file mode 100644
index aff3faf666..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-inline void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
- const uint8_t *a_ptr = Apanel;
- uint32_t *c_ptr = Cpanel;
- K /= 16;
-
- for (int yb=0; yb<ablocks; yb++) {
- const uint8_t *a_ptr0 = a_ptr;
- const uint8_t *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
-
- int k = K-1;
-
- register uint8x16_t b0 asm("v4");
- register uint8x16_t b1 asm("v5");
- register uint8x16_t b2 asm("v6");
- register uint8x16_t b3 asm("v7");
-
- __asm __volatile (
- "movi v16.4s, #0x0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v17.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v18.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v19.4s, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v20.4s, #0x0\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
- "movi v21.4s, #0x0\n"
- "ldr q1, [%[a_ptr], #16]\n"
- "movi v22.4s, #0x0\n"
- "ldr q2, [%[a_ptr], #32]\n"
- "movi v23.4s, #0x0\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "movi v24.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v25.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v26.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v27.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v28.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v29.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v30.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v31.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
-
- "umull v12.8h, v0.8b, %[b0].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "umull v13.8h, v0.8b, %[b1].8b\n"
- "umull v14.8h, v0.8b, %[b2].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "umull v15.8h, v0.8b, %[b3].8b\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 2f\n"
-
- "1:\n"
- "uadalp v16.4s, v12.8h\n"
- "umull2 v12.8h, v0.16b, %[b0].16b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull2 v13.8h, v0.16b, %[b1].16b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull2 v14.8h, v0.16b, %[b2].16b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull2 v15.8h, v0.16b, %[b3].16b\n"
- "ldr q0, [%[a_ptr]]\n"
-
- "uadalp v16.4s, v12.8h\n"
- "umull v12.8h, v1.8b, %[b0].8b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull v13.8h, v1.8b, %[b1].8b\n"
- "subs %w[k], %w[k], #1\n"
- "uadalp v18.4s, v14.8h\n"
- "umull v14.8h, v1.8b, %[b2].8b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull v15.8h, v1.8b, %[b3].8b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull2 v12.8h, v1.16b, %[b0].16b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull2 v13.8h, v1.16b, %[b1].16b\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "uadalp v22.4s, v14.8h\n"
- "umull2 v14.8h, v1.16b, %[b2].16b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull2 v15.8h, v1.16b, %[b3].16b\n"
- "ldr q1, [%[a_ptr], #16]\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull v12.8h, v2.8b, %[b0].8b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull v13.8h, v2.8b, %[b1].8b\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "uadalp v22.4s, v14.8h\n"
- "umull v14.8h, v2.8b, %[b2].8b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull v15.8h, v2.8b, %[b3].8b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull2 v12.8h, v2.16b, %[b0].16b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull2 v13.8h, v2.16b, %[b1].16b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull2 v14.8h, v2.16b, %[b2].16b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull2 v15.8h, v2.16b, %[b3].16b\n"
- "ldr q2, [%[a_ptr], #32]\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull v12.8h, v3.8b, %[b0].8b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull v13.8h, v3.8b, %[b1].8b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull v14.8h, v3.8b, %[b2].8b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull v15.8h, v3.8b, %[b3].8b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull2 v12.8h, v3.16b, %[b0].16b\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "uadalp v29.4s, v13.8h\n"
- "umull2 v13.8h, v3.16b, %[b1].16b\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "uadalp v30.4s, v14.8h\n"
- "umull2 v14.8h, v3.16b, %[b2].16b\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "uadalp v31.4s, v15.8h\n"
- "umull2 v15.8h, v3.16b, %[b3].16b\n"
- "ldr %q[b3], [%[b_ptr], #48]\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull v12.8h, v0.8b, %[b0].8b\n"
- "add %[b_ptr], %[b_ptr], #64\n"
- "uadalp v29.4s, v13.8h\n"
- "umull v13.8h, v0.8b, %[b1].8b\n"
- "ldr q3, [%[a_ptr], #48]\n"
- "uadalp v30.4s, v14.8h\n"
- "umull v14.8h, v0.8b, %[b2].8b\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "uadalp v31.4s, v15.8h\n"
- "umull v15.8h, v0.8b, %[b3].8b\n"
- "bne 1b\n"
-
- // Branch target
- "2:\n"
- "uadalp v16.4s, v12.8h\n"
- "umull2 v12.8h, v0.16b, %[b0].16b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull2 v13.8h, v0.16b, %[b1].16b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull2 v14.8h, v0.16b, %[b2].16b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull2 v15.8h, v0.16b, %[b3].16b\n"
-
- "uadalp v16.4s, v12.8h\n"
- "umull v12.8h, v1.8b, %[b0].8b\n"
- "uadalp v17.4s, v13.8h\n"
- "umull v13.8h, v1.8b, %[b1].8b\n"
- "uadalp v18.4s, v14.8h\n"
- "umull v14.8h, v1.8b, %[b2].8b\n"
- "uadalp v19.4s, v15.8h\n"
- "umull v15.8h, v1.8b, %[b3].8b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull2 v12.8h, v1.16b, %[b0].16b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull2 v13.8h, v1.16b, %[b1].16b\n"
- "uadalp v22.4s, v14.8h\n"
- "umull2 v14.8h, v1.16b, %[b2].16b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull2 v15.8h, v1.16b, %[b3].16b\n"
-
- "uadalp v20.4s, v12.8h\n"
- "umull v12.8h, v2.8b, %[b0].8b\n"
- "uadalp v21.4s, v13.8h\n"
- "umull v13.8h, v2.8b, %[b1].8b\n"
- "uadalp v22.4s, v14.8h\n"
- "umull v14.8h, v2.8b, %[b2].8b\n"
- "uadalp v23.4s, v15.8h\n"
- "umull v15.8h, v2.8b, %[b3].8b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull2 v12.8h, v2.16b, %[b0].16b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull2 v13.8h, v2.16b, %[b1].16b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull2 v14.8h, v2.16b, %[b2].16b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull2 v15.8h, v2.16b, %[b3].16b\n"
-
- "uadalp v24.4s, v12.8h\n"
- "umull v12.8h, v3.8b, %[b0].8b\n"
- "uadalp v25.4s, v13.8h\n"
- "umull v13.8h, v3.8b, %[b1].8b\n"
- "uadalp v26.4s, v14.8h\n"
- "umull v14.8h, v3.8b, %[b2].8b\n"
- "uadalp v27.4s, v15.8h\n"
- "umull v15.8h, v3.8b, %[b3].8b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "umull2 v12.8h, v3.16b, %[b0].16b\n"
- "uadalp v29.4s, v13.8h\n"
- "umull2 v13.8h, v3.16b, %[b1].16b\n"
- "uadalp v30.4s, v14.8h\n"
- "umull2 v14.8h, v3.16b, %[b2].16b\n"
- "uadalp v31.4s, v15.8h\n"
- "umull2 v15.8h, v3.16b, %[b3].16b\n"
-
- "uadalp v28.4s, v12.8h\n"
- "uadalp v29.4s, v13.8h\n"
- "uadalp v30.4s, v14.8h\n"
- "uadalp v31.4s, v15.8h\n"
-
- "addp v16.4s, v16.4s, v17.4s\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
- "addp v20.4s, v24.4s, v25.4s\n"
- "addp v21.4s, v26.4s, v27.4s\n"
- "addp v22.4s, v28.4s, v29.4s\n"
- "addp v23.4s, v30.4s, v31.4s\n"
-
- "addp v16.4s, v16.4s, v17.4s\n"
- "addp v17.4s, v18.4s, v19.4s\n"
- "addp v18.4s, v20.4s, v21.4s\n"
- "addp v19.4s, v22.4s, v23.4s\n"
-
- "str q16, [%[c_ptr]]\n"
- "str q17, [%[c_ptr], #16]\n"
- "str q18, [%[c_ptr], #32]\n"
- "str q19, [%[c_ptr], #48]\n"
- "add %[c_ptr], %[c_ptr], #64\n"
-
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3),
- [k] "+r" (k)
- :
- : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19",
- "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc");
- }
- }
-}
-
-#endif // __aarch64__
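The 4x4 kernel above has no UDOT available to it, so it widens with UMULL/UMULL2 and accumulates pairwise with UADALP before a final ADDP reduction. A minimal intrinsics sketch of that idiom for a 16-byte dot product (AArch64 only; hypothetical helper name):

#include <arm_neon.h>
#include <cstdint>

// Multiply 8 unsigned bytes at a time into 16-bit products, then
// pairwise-accumulate the products into 32-bit lanes so the running sum
// cannot overflow, and reduce the four lanes at the end.
static uint32_t dot_u8_16(const uint8_t *a, const uint8_t *b) {
    uint8x8_t  a_lo = vld1_u8(a),     b_lo = vld1_u8(b);
    uint8x8_t  a_hi = vld1_u8(a + 8), b_hi = vld1_u8(b + 8);
    uint32x4_t acc  = vdupq_n_u32(0);
    acc = vpadalq_u16(acc, vmull_u8(a_lo, b_lo));  // "umull" + "uadalp"
    acc = vpadalq_u16(acc, vmull_u8(a_hi, b_hi));  // "umull2" + "uadalp"
    return vaddvq_u32(acc);                        // final "addp"-style reduction
}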
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp
deleted file mode 100644
index 1789abb046..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/a55r1.hpp
+++ /dev/null
@@ -1,384 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
- const __fp16 *a_ptr = Apanel;
- __fp16 *c_ptr = Cpanel;
-
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k_iters = ((K+1)/2) - 1;
-
- for (int yb=0; yb<ablocks; yb++) {
- const __fp16 *a_ptr0 = a_ptr;
- const __fp16 *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- int k = k_iters;
- a_ptr = a_ptr0;
-
- // As A55 requires 64-bit loads anyway, just use 64 bits of the
- // "A" operands to save on "ins" instructions. Since A55 is
- // in-order, two sets of "A" operands and one set of "B" is
- // sufficient.
- register float16x8_t a0 asm("v0");
- register float16x8_t a1 asm("v1");
- register float16x8_t a0a asm("v2");
- register float16x8_t a1a asm("v3");
- register float16x8_t b0 asm("v4");
- register float16x8_t b1 asm("v5");
- register float16x8_t b2 asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.8h, #0x0\n"
- "ldr %d[a0], [%[a_ptr]]\n"
- "movi v9.8h, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.8h, #0x0\n"
- "ldr %d[a1], [%[a_ptr], #8]\n"
- "movi v11.8h, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.8h, #0x0\n"
- "movi v13.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v14.8h, #0x0\n"
- "movi v15.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v16.8h, #0x0\n"
- "movi v17.8h, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v18.8h, #0x0\n"
- "movi v19.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v20.8h, #0x0\n"
- "movi v21.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v22.8h, #0x0\n"
- "movi v23.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v24.8h, #0x0\n"
- "movi v25.8h, #0x0\n"
- "movi v26.8h, #0x0\n"
- "movi v27.8h, #0x0\n"
- "movi v28.8h, #0x0\n"
- "movi v29.8h, #0x0\n"
- "movi v30.8h, #0x0\n"
- "movi v31.8h, #0x0\n"
-
- // The loop is offset by these two instructions which must
- // always be executed.
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %d[a0a], [%[a_ptr], #16]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- "ldr %d[a1a], [%[a_ptr], #24]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
-
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- // Unroll 1
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
- "ldr %d[a0], [%[a_ptr], #32]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
- "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
- "ldr %d[a1], [%[a_ptr], #40]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
-
- "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "bne 1b\n"
-
- "4:\n"
-
- // Start final iteration - branch off to "odd" code before we load a0a
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "cbnz %w[oddk], 2f\n"
-
- // Even K continuation
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %d[a0a], [%[a_ptr], #16]\n"
-
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr]]")
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- "ldr %d[a1a], [%[a_ptr], #24]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #128]")
-
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #256]")
-
- "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
- "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
- "ldr %d[a1], [%[a_ptr], #40]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #448]")
-
- "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #576]")
-
- "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #704]")
-
- "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
- "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
- "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
- "b 3f\n"
-
- "2:\n"
-
- // Odd tail
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr]]")
-
- "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #64]")
- "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
- "add %[a_ptr], %[a_ptr], #16\n"
- "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #128]")
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #192]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- ASM_PREFETCHW("[%[c_ptr], #256]")
-
- "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
- "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
- ASM_PREFETCHW("[%[c_ptr], #320]")
- "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
- "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #384]")
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #384]")
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #448]")
-
- "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #512]")
- "fmla v29.8h, %[b2].8h, %[a1].h[1]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #576]")
- "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #640]")
- "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
- ASM_PREFETCHWL2("[%[c_ptr], #704]")
-
- // Common tail
- // A55 won't dual issue these stores with anything else, so
- // simplest to do them all in this common code.
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "5:\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "=w" (a0), [a0a] "=w" (a0a), [a1] "=w" (a1), [a1a] "=w" (a1a),
- [b0] "=w" (b0), [b1] "=w" (b1), [b2] "=w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
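The A55R1 kernel above never issues a 128-bit "ldr q" for its B operands: each one is assembled from a 64-bit vector load plus a 64-bit general-purpose load moved in with "ins", which suits the in-order A55 pipeline as the comments in the kernel explain. The same idea expressed with intrinsics (a rough sketch, using float for clarity and a hypothetical helper name):

#include <arm_neon.h>

// Build a 128-bit vector from two 64-bit loads: the low half comes straight
// from memory (like "ldr %d[b], [ptr]"), the high half is fetched separately
// and combined (like "ldr x20, [ptr, #8]" followed by "ins %[b].d[1], x20").
static inline float32x4_t load_split_128(const float *p) {
    float32x2_t lo = vld1_f32(p);
    float32x2_t hi = vld1_f32(p + 2);
    return vcombine_f32(lo, hi);
}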
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp
deleted file mode 100644
index 03e2bb95a3..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
- const __fp16 *a_ptr = Apanel;
- __fp16 *c_ptr = Cpanel;
- for (int yb=0; yb<ablocks; yb++) {
- const __fp16 *a_ptr0 = a_ptr;
- const __fp16 *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
- register float16x8_t a0 asm("v0");
- register float16x8_t a0a asm("v1");
- register float16x8_t b0 asm("v2");
- register float16x8_t b1 asm("v3");
- register float16x8_t b2 asm("v4");
- register float16x8_t b0a asm("v5");
- register float16x8_t b1a asm("v6");
- register float16x8_t b2a asm("v7");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.8h, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.8h, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.8h, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v11.8h, #0x0\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "movi v12.8h, #0x0\n"
- "ldr %q[b0a], [%[b_ptr], #48]\n"
- "movi v13.8h, #0x0\n"
- "ldr %q[b1a], [%[b_ptr], #64]\n"
- "movi v14.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v15.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v16.8h, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v17.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v18.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v19.8h, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.8h, #0x0\n"
- "movi v21.8h, #0x0\n"
- "movi v22.8h, #0x0\n"
- "movi v23.8h, #0x0\n"
- "movi v24.8h, #0x0\n"
- "movi v25.8h, #0x0\n"
- "movi v26.8h, #0x0\n"
- "movi v27.8h, #0x0\n"
- "movi v28.8h, #0x0\n"
- "movi v29.8h, #0x0\n"
- "movi v30.8h, #0x0\n"
- "movi v31.8h, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr %q[a0a], [%[a_ptr], #16]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2a], [%[b_ptr], #80]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- ASM_PREFETCH("[%[b_ptr], #288]")
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
- "ldr %q[a0], [%[a_ptr], #32]\n"
-
- "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
- "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
- "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
- "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
- "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
- "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
- "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
- "ldr %q[b0a], [%[b_ptr], #48]\n"
-
- "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
- "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
- ASM_PREFETCH("[%[b_ptr], #352]")
- "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
- "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
- "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
- "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
- "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
- "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
- "ldr %q[b1a], [%[b_ptr], #64]\n"
-
- "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
- "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
- "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
- "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
- "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
- "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
-
- "bne 1b\n"
- "4:\n"
-
- // Jump to odd tail if necessary.
- "cbnz %w[oddk], 2f\n"
-
- // Even tail.
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "ldr %q[a0a], [%[a_ptr], #16]\n"
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "ldr %q[b2a], [%[b_ptr], #80]\n"
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
-
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
-
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
-
- "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
- "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
- "b 3f\n"
-
- // Odd tail
- "2:\n"
- "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
- "add %[a_ptr], %[a_ptr], #16\n"
- "str q8, [%[c_ptr]]\n"
- "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
- "str q16, [%[c_ptr], #16]\n"
-
- "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
- "str q17, [%[c_ptr], #64]\n"
-
- "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
- "str q10, [%[c_ptr], #96]\n"
- "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
- "str q18, [%[c_ptr], #112]\n"
-
- "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
- "str q11, [%[c_ptr], #144]\n"
- "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
- "str q19, [%[c_ptr], #160]\n"
-
- "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
- "str q12, [%[c_ptr], #192]\n"
- "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
- "str q20, [%[c_ptr], #208]\n"
-
- "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
- "str q13, [%[c_ptr], #240]\n"
- "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
- "str q21, [%[c_ptr], #256]\n"
-
- "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
- "str q14, [%[c_ptr], #288]\n"
- "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
- "str q22, [%[c_ptr], #304]\n"
-
- "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
- "str q15, [%[c_ptr], #336]\n"
- "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
-
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a0a] "+w" (a0a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k),
- [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
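All of these kernels share the same odd/even K bookkeeping: the main loop is unrolled twice, one detached tail iteration always runs, and the "oddk" flag decides whether that tail handles one K step or two. A small self-check (hypothetical helper) shows that every K step is covered exactly once:

#include <cassert>

// K >= 1 is assumed, matching how the kernels are invoked.
static void check_k_split(int K) {
    const int oddk  = K & 1;              // tail handles 1 step if set, else 2
    const int k     = ((K + 1) / 2) - 1;  // main-loop iterations, 2 K-steps each
    const int steps = 2 * k + (oddk ? 1 : 2);
    assert(steps == K);
}

// e.g. K = 7: k = 3 loop iterations (6 steps) plus 1 odd-tail step = 7.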
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
deleted file mode 100644
index 1c9b4b38fc..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- // Unroll 0
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- ASM_PREFETCH("[%[a_ptr], #320]")
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
-
- ASM_PREFETCH("[%[b_ptr], #448]")
- "nop\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "nop\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
-
- ASM_PREFETCH("[%[b_ptr], #512]")
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ldr %d[a0], [%[a_ptr], #64]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
-
- "ldr %d[a1], [%[a_ptr], #80]\n"
- "ins %[a0].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #96]\n"
- "ins %[a1].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
-
- "nop\n"
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
-
- "nop\n"
- "nop\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
-
- "ldr %d[b1], [%[b_ptr], #112]\n"
- "nop\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
-
- "nop\n"
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
-
- "bne 1b\n"
-
- // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
- "4:\n"
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration. (even K)
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ins %[b0].d[1], x20\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
-
- "nop\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "nop\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
-
- "ins %[b1].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
-
- // Detached final iteration. (odd K)
- "2:\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "nop\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Common tail
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
-
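The A53 SGEMM kernel above (and the A55 variant that follows) is built entirely from lane-broadcast FMLAs: a single "fmla v8.4s, %[b0].4s, %[a0].s[0]" updates four columns of one C row from one broadcast A element. A hedged intrinsics sketch of that step, with a hypothetical helper name (the lane index must be a compile-time constant, hence the switch):

#include <arm_neon.h>

// c_row += b_cols * a_vals[lane]: four output columns of a single C row are
// updated from one broadcast A value, mirroring the kernel's fmla-by-lane.
static inline float32x4_t fma_row_lane(float32x4_t c_row, float32x4_t b_cols,
                                       float32x4_t a_vals, int lane) {
    switch (lane) {
        case 0:  return vfmaq_laneq_f32(c_row, b_cols, a_vals, 0);
        case 1:  return vfmaq_laneq_f32(c_row, b_cols, a_vals, 1);
        case 2:  return vfmaq_laneq_f32(c_row, b_cols, a_vals, 2);
        default: return vfmaq_laneq_f32(c_row, b_cols, a_vals, 3);
    }
}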
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp
deleted file mode 100644
index 85d8a502f8..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-inline void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- // Unroll 0
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "subs %w[k], %w[k], #1\n"
-
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
-
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "ins %[b1].d[1], x20\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
-
- "ldr %d[a0], [%[a_ptr], #64]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
-
- "ldr %d[a1], [%[a_ptr], #80]\n"
- "ins %[a0].d[1], x20\n"
-
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
-
-
- "ldr %d[b0], [%[b_ptr], #96]\n"
- "ins %[a1].d[1], x20\n"
-
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #112]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
-
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
-
-
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "ins %[b1].d[1], x20\n"
-
-
- "bne 1b\n"
-
- // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
- "4:\n"
- "cbnz %w[oddk], 2f\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- // Detached final iteration. (even K)
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
-
- "ldr %d[a0a], [%[a_ptr], #32]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
-
- "ldr %d[a1a], [%[a_ptr], #48]\n"
- "ins %[a0a].d[1], x20\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
-
- "ldr %d[b0], [%[b_ptr], #48]\n"
- "ins %[a1a].d[1], x20\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
-
- "ldr %d[b1], [%[b_ptr], #64]\n"
- "ins %[b0].d[1], x20\n"
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- "ldr %d[b2], [%[b_ptr], #80]\n"
- "ins %[b1].d[1], x20\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
-
- "ins %[b2].d[1], x20\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
-
- // Detached final iteration. (odd K)
- "2:\n"
-
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Common tail
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
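
For reference: the A55-tuned kernel above splits every 128-bit operand load into a 64-bit "ldr d", a scalar "ldr x20" and an "ins" into the upper half, interleaved with the fmla stream, presumably to suit the narrower load path of the Cortex-A55. A minimal NEON-intrinsics sketch of the same data movement (illustration only, not code from this patch; the helper name is made up):

#include <arm_neon.h>

// Assemble one 128-bit operand from two 64-bit halves, mirroring the
// "ldr d" / "ldr x20" / "ins vN.d[1], x20" sequences in the kernel above.
static inline float32x4_t load_q_as_two_halves(const float *ptr) {
    float32x2_t lo = vld1_f32(ptr);      // low 64 bits  ("ldr %d[reg]")
    float32x2_t hi = vld1_f32(ptr + 2);  // high 64 bits ("ldr x20" + "ins")
    return vcombine_f32(lo, hi);         // same result as a single "ldr q"
}
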
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp
deleted file mode 100644
index 295308053f..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-inline void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
-
- ASM_PREFETCH("[%[a_ptr], #128]")
- ASM_PREFETCH("[%[a_ptr], #192]")
- "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
-
- ASM_PREFETCH("[%[b_ptr], #128]")
- ASM_PREFETCH("[%[b_ptr], #192]")
- ASM_PREFETCH("[%[b_ptr], #256]")
-
- ASM_PREFETCH("[%[a_ptr], #256]")
- ASM_PREFETCH("[%[a_ptr], #320]")
- ASM_PREFETCH("[%[a_ptr], #384]")
-
- ASM_PREFETCH("[%[b_ptr], #320]")
- ASM_PREFETCH("[%[b_ptr], #384]")
- ASM_PREFETCH("[%[b_ptr], #448]")
- ASM_PREFETCH("[%[b_ptr], #512]")
-
- "movi v8.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "movi v11.4s, #0x0\n"
- "movi v12.4s, #0x0\n"
- "movi v13.4s, #0x0\n"
- "movi v14.4s, #0x0\n"
- "movi v15.4s, #0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
-
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- "1:\n"
- // Unroll 0
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- ASM_PREFETCH("[%[a_ptr], #448]")
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #576]")
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Unroll 1
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "ldr %d[a0], [%[a_ptr], #64]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "ldr x20, [%[a_ptr], #72]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "ldr %d[a1], [%[a_ptr], #80]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "ins %[a0].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "ldr x20, [%[a_ptr], #88]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #96]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "ins %[a1].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "ldr x20, [%[b_ptr], #104]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #120]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
-
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
-
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- ASM_PREFETCH("[%[b_ptr], #640]")
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
-
-
- "bne 1b\n"
-
- // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
- "4:\n"
- "cbnz %w[oddk], 2f\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
-
- // Detached final iteration. (even K)
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %d[a0a], [%[a_ptr], #32]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr x20, [%[a_ptr], #40]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %d[a1a], [%[a_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "ins %[a0a].d[1], x20\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "ldr x20, [%[a_ptr], #56]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
-
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "ldr %d[b0], [%[b_ptr], #48]\n"
-
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "ins %[a1a].d[1], x20\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #56]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "ldr %d[b1], [%[b_ptr], #64]\n"
-
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "ins %[b0].d[1], x20\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "ldr x20, [%[b_ptr], #72]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
-
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %d[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "ins %[b1].d[1], x20\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr x20, [%[b_ptr], #88]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "ins %[b2].d[1], x20\n"
-
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "b 3f\n"
-
- // Detached final iteration. (odd K)
- "2:\n"
-
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %d[b2], [%[b_ptr], #32]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr x20, [%[b_ptr], #40]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "ins %[b2].d[1], x20\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
-
- // Common tail
- "3:\n"
- "str q8, [%[c_ptr]]\n"
- "str q16, [%[c_ptr], #16]\n"
- "str q24, [%[c_ptr], #32]\n"
- "str q9, [%[c_ptr], #48]\n"
- "str q17, [%[c_ptr], #64]\n"
- "str q25, [%[c_ptr], #80]\n"
- "str q10, [%[c_ptr], #96]\n"
- "str q18, [%[c_ptr], #112]\n"
- "str q26, [%[c_ptr], #128]\n"
- "str q11, [%[c_ptr], #144]\n"
- "str q19, [%[c_ptr], #160]\n"
- "str q27, [%[c_ptr], #176]\n"
- "str q12, [%[c_ptr], #192]\n"
- "str q20, [%[c_ptr], #208]\n"
- "str q28, [%[c_ptr], #224]\n"
- "str q13, [%[c_ptr], #240]\n"
- "str q21, [%[c_ptr], #256]\n"
- "str q29, [%[c_ptr], #272]\n"
- "str q14, [%[c_ptr], #288]\n"
- "str q22, [%[c_ptr], #304]\n"
- "str q30, [%[c_ptr], #320]\n"
- "str q15, [%[c_ptr], #336]\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
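
All of the 12x8 SGEMM kernels removed here share the same odd-K fix-up before the inline assembly: oddk = K & 1 flags an odd tail, and k = ((K+1)/2) - 1 is the number of 2x-unrolled main-loop iterations, with the detached final iteration covering the last one or two K steps. A small standalone sketch of that arithmetic (hypothetical helper, shown only to make the counts concrete):

#include <cassert>

// Loop-count fix-up used by the 12x8 kernels: "k" 2x-unrolled iterations,
// then a detached tail of one (oddk == 1) or two (oddk == 0) K steps.
struct LoopCounts { int k; int oddk; };

inline LoopCounts sgemm_12x8_counts(int K) {
    return { ((K + 1) / 2) - 1, K & 1 };
}

int main() {
    // K = 7: three unrolled iterations (6 K steps) plus a single-step tail.
    assert(sgemm_12x8_counts(7).k == 3 && sgemm_12x8_counts(7).oddk == 1);
    // K = 8: three unrolled iterations (6 K steps) plus a two-step tail.
    assert(sgemm_12x8_counts(8).k == 3 && sgemm_12x8_counts(8).oddk == 0);
    return 0;
}
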
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
deleted file mode 100644
index c4a5875a31..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-
-// Kernel implementation.
-//
-// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
-// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
-// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 12x8), the chunks being arranged in a row major fashion.
-//
-// Note that the intent of this is that either ablocks or bblocks will be 1
-// - this construction allows the output loop to proceed in either order.
-
-inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump=0, long int block_jump=0) {
- const float *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- for (int yb=0; yb<ablocks; yb++) {
- const float *a_ptr0 = a_ptr;
- const float *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- // Fix up for odd lengths - set a flag if K is odd, but make
- // sure we round up the iteration count.
- int oddk = (K & 1);
- int k = ((K+1)/2) - 1;
-
- register float32x4_t a0 asm("v0");
- register float32x4_t a1 asm("v1");
- register float32x4_t b0 asm("v2");
- register float32x4_t b1 asm("v3");
- register float32x4_t b2 asm("v4");
- register float32x4_t a0a asm("v5");
- register float32x4_t a1a asm("v6");
-
- __asm __volatile (
- // Initialize result registers, load initial operands, prime prefetches.
- "movi v8.4s, #0x0\n"
- "ldr %q[a0], [%[a_ptr]]\n"
- "movi v9.4s, #0x0\n"
- "ldr %q[b0], [%[b_ptr]]\n"
- "movi v10.4s, #0x0\n"
- "ldr %q[a1], [%[a_ptr], #16]\n"
- "movi v11.4s, #0x0\n"
- "ldr %q[b1], [%[b_ptr], #16]\n"
- "movi v12.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #64]")
- "movi v13.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #64]")
- "movi v14.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #128]")
- "movi v15.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #128]")
- "movi v16.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #192]")
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #256]")
- "movi v18.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #192]")
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #320]")
- "movi v20.4s, #0x0\n"
- ASM_PREFETCH("[%[a_ptr], #256]")
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[b_ptr], #384]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip loop if we are doing zero iterations of it.
- "cbz %w[k], 4f\n"
-
- // Loop proper
- "1:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- ASM_PREFETCH("[%[a_ptr], #320]")
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #448]")
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "ldr %q[a0], [%[a_ptr], #64]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "ldr %q[a1], [%[a_ptr], #80]\n"
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #96]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- ASM_PREFETCH("[%[b_ptr], #512]")
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #112]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "bne 1b\n"
-
- // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
- "4:\n"
-
- // Branch to alternative tail for odd K
- "cbnz %w[oddk], 2f\n"
-
- // Detached final iteration (even K)
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "ldr %q[a0a], [%[a_ptr], #32]\n"
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "ldr %q[a1a], [%[a_ptr], #48]\n"
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "ldr %q[b0], [%[b_ptr], #48]\n"
-
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "ldr %q[b1], [%[b_ptr], #64]\n"
-
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "add %[a_ptr], %[a_ptr], #64\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "ldr %q[b2], [%[b_ptr], #80]\n"
-
- "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
- "add %[b_ptr], %[b_ptr], %[block_jump]\n"
- "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
- "add %[b_ptr], %[b_ptr], #96\n"
- "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
- "str q24, [%[c_ptr], #32]\n"
-
- "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
- "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- "b 3f\n"
-
- // Detached final iteration (odd K)
- "2:\n"
- "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
- "ldr %q[b2], [%[b_ptr], #32]\n"
- "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
- "add %[b_ptr], %[b_ptr], %[row_jump]\n"
- "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
- "str q8, [%[c_ptr], #0]\n"
- "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
- "str q16, [%[c_ptr], #16]\n"
- "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
- "add %[b_ptr], %[b_ptr], #48\n"
- "add %[a_ptr], %[a_ptr], #32\n"
- "str q24, [%[c_ptr], #32]\n"
- "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
- "str q9, [%[c_ptr], #48]\n"
-
- "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
- "str q17, [%[c_ptr], #64]\n"
- "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
- "str q25, [%[c_ptr], #80]\n"
- "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
- "str q10, [%[c_ptr], #96]\n"
-
- "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
- "str q18, [%[c_ptr], #112]\n"
- "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
- "str q26, [%[c_ptr], #128]\n"
- "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
- "str q11, [%[c_ptr], #144]\n"
-
- "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
- "str q19, [%[c_ptr], #160]\n"
- "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
- "str q27, [%[c_ptr], #176]\n"
- "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
- "str q12, [%[c_ptr], #192]\n"
-
- "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
- "str q20, [%[c_ptr], #208]\n"
- "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
- "str q28, [%[c_ptr], #224]\n"
- "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
- "str q13, [%[c_ptr], #240]\n"
-
- "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
- "str q21, [%[c_ptr], #256]\n"
- "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
- "str q29, [%[c_ptr], #272]\n"
- "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
- "str q14, [%[c_ptr], #288]\n"
-
- "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
- "str q22, [%[c_ptr], #304]\n"
- "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
- "str q30, [%[c_ptr], #320]\n"
- "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
- "str q15, [%[c_ptr], #336]\n"
-
- // Common tail
- "3:\n"
- "str q23, [%[c_ptr], #352]\n"
- "str q31, [%[c_ptr], #368]\n"
- "add %[c_ptr], %[c_ptr], #384\n"
- :
- [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
- [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
- : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
- : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
- "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
- }
-}
-
-inline void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
-}
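
Per the header comment of the deleted generic.hpp, A arrives as 8xK panels, B as 12xK panels, and each output block is written as 8 rows of 12 floats (q8/q16/q24 form row 0, q9/q17/q25 row 1, and so on). A plain scalar model of what one block of the assembly computes, consistent with that load/store pattern (sketch only, not part of the library):

// Scalar reference for one output block: the A panel is k-major with 8 values
// per K step, the B panel is k-major with 12 values per K step, and the block
// is overwritten (the kernel starts its accumulators at zero).
inline void sgemm_12x8_block_ref(const float *Apanel, const float *Bpanel,
                                 float *Cpanel, int K) {
    for (int r = 0; r < 8; r++) {
        for (int c = 0; c < 12; c++) {
            float acc = 0.0f;
            for (int k = 0; k < K; k++) {
                acc += Apanel[k * 8 + r] * Bpanel[k * 12 + c];
            }
            Cpanel[r * 12 + c] = acc;
        }
    }
}
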
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
deleted file mode 100644
index 33f2b701cf..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/kernels/generic.hpp
+++ /dev/null
@@ -1,913 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-// Kernel implementation - transposed GEMV
-//
-// The kernel will process "M" rows of A (= steps of dot product) and "N"
-// columns (= dot products total)
-//
-// General plan is to do as many columns simultaneously as possible - a
-// reasonable limit is half the NEON regfile = 64 total accumulators.
-//
-// It's possible that messing around with sub-blocking M and N can yield
-// higher performance, but that's left to the outer loop. In this kernel we
-// process all of M at the same time.
-
-
-// How far ahead to prefetch for the first and subsequent prefetches.
-// These values work for A72 on JunoR2...
-
-#define FIRST_PFD 9
-#define PFD 6
-
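// Sketch (not from the original file; names are illustrative): where the
// 96-column strip size used by a64_sgemv_trans below comes from. v8..v31
// hold the running dot products, four columns of Y per register:
constexpr int sgemv_trans_accum_regs    = 24;                           // v8 .. v31
constexpr int sgemv_trans_cols_per_pass = sgemv_trans_accum_regs * 4;   // = 96
static_assert(sgemv_trans_cols_per_pass == 96,
              "matches the 'for (;N>=96;N-=96)' strip loop below");
// The remaining N % 96 columns (at most 95) are handled by the straggler
// path in the second half of the function.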
-inline void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N) {
- const float *a_ptr_base = Astart;
- float *y_ptr = Ystart;
-
- register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
-
- int firstpfd=FIRST_PFD;
- if (firstpfd > M) {
- firstpfd = (M-1);
- }
-
- int pfd = PFD;
- if (pfd > M) {
- pfd = (M-1);
- }
-
- ptrdiff_t jump = lda * sizeof(int);
-
- for (;N>=96;N-=96) {
- int k = M-1;
-
- const float *a_ptr = a_ptr_base;
- const float *x_ptr = Xstart;
- const float *pf_ptr = a_ptr;
- const float *firstpf_ptr = a_ptr;
- const float *pf_limit = a_ptr + (M * lda);
-
- for (int i=0; i<firstpfd; i++) {
- prefetch_1x(firstpf_ptr);
- firstpf_ptr += lda;
- }
-
- for (int i=0; i<pfd; i++) {
- prefetch_5x(pf_ptr + 16);
- pf_ptr += lda;
- }
-
- a_ptr_base += 96;
-
- __asm __volatile (
- "movi v8.4s,#0x0\n"
- "ldr w0, [%[x_ptr]]\n"
- "movi v9.4s,#0x0\n"
- "ldr q2, [%[a_ptr], #0]\n"
- "movi v10.4s,#0x0\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "movi v11.4s,#0x0\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "movi v12.4s,#0x0\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- "movi v13.4s,#0x0\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "movi v14.4s,#0x0\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "movi v15.4s,#0x0\n"
- ASM_PREFETCH("[%[firstpf_ptr]]")
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #64]")
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #128]")
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #192]")
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #256]")
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- ASM_PREFETCH("[%[pf_ptr], #320]")
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "movi v28.4s, #0x0\n"
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v31.4s, #0x0\n"
-
- // Skip everything if there are no iterations of the main loop to do.
- "cbz %w[k], 10f\n"
-
- // Loop with all prefetches. Exit this loop when firstpf_ptr
- // hits pf_limit.
- "1:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #0x4\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- ASM_PREFETCH("[%[firstpf_ptr]]")
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "sub %w[k], %w[k], #1\n"
- ASM_PREFETCH("[%[x_ptr], #128]")
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x40]")
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x80]")
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- ASM_PREFETCH("[%[pf_ptr], #0xc0]")
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x100]")
- "add %[a_ptr], %[a_ptr], %[jump]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x00]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x140]")
- "fmla v30.4s, v6.4s, v0.4s\n"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "cmp %[firstpf_ptr], %[pf_limit]\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "blt 1b\n"
-
- // Check that there are still "main" prefetches to do.
- "cmp %[pf_ptr], %[pf_limit]\n"
- "bge 9f\n"
-
- // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
- "8:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #0x4\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "sub %w[k], %w[k], #1\n"
- ASM_PREFETCH("[%[x_ptr], #128]")
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x40]")
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x80]")
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- ASM_PREFETCH("[%[pf_ptr], #0xc0]")
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x100]")
- "add %[a_ptr], %[a_ptr], %[jump]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x00]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- ASM_PREFETCH("[%[pf_ptr], #0x140]")
- "fmla v30.4s, v6.4s, v0.4s\n"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "cmp %[pf_ptr], %[pf_limit]\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "blt 8b\n"
-
- // Check that there is still work to do.
- "9:\n"
- "cmp %w[k], #0\n"
- "beq 10f\n"
-
- // Loop without prefetches, exit when k hits 0.
- "2:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #0x4\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "subs %w[k], %w[k], #1\n"
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- "add %[a_ptr], %[a_ptr], %[jump]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x00]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x30]\n"
- "fmla v30.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x50]\n"
- "bne 2b\n"
-
- "10:\n"
-
- // Final iteration
- "dup v0.4s, w0\n"
- "fmla v8.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x60]\n"
- "fmla v9.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x70]\n"
- "fmla v10.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x80]\n"
- "fmla v11.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x90]\n"
- "fmla v12.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0xa0]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0xb0]\n"
- "fmla v14.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0xc0]\n"
- "fmla v15.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0xd0]\n"
- "fmla v16.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0xe0]\n"
- "fmla v17.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0xf0]\n"
- "fmla v18.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x100]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x110]\n"
- "fmla v20.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[a_ptr], #0x120]\n"
- "fmla v21.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[a_ptr], #0x130]\n"
- "fmla v22.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[a_ptr], #0x140]\n"
- "fmla v23.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[a_ptr], #0x150]\n"
- "fmla v24.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[a_ptr], #0x160]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[a_ptr], #0x170]\n"
- "fmla v26.4s, v2.4s, v0.4s\n"
- "ldr q2, [%[y_ptr]]\n"
- "fmla v27.4s, v3.4s, v0.4s\n"
- "ldr q3, [%[y_ptr], #0x10]\n"
- "fmla v28.4s, v4.4s, v0.4s\n"
- "ldr q4, [%[y_ptr], #0x20]\n"
- "fmla v29.4s, v5.4s, v0.4s\n"
- "ldr q5, [%[y_ptr], #0x30]\n"
- "fmla v30.4s, v6.4s, v0.4s\n"
- "ldr q6, [%[y_ptr], #0x40]\n"
- "fmla v31.4s, v7.4s, v0.4s\n"
- "ldr q7, [%[y_ptr], #0x50]\n"
-
- "fmla v2.4s, v8.4s, %[va].4s\n"
- "ldr q8, [%[y_ptr], #0x60]\n"
- "fmla v3.4s, v9.4s, %[va].4s\n"
- "ldr q9, [%[y_ptr], #0x70]\n"
- "fmla v4.4s, v10.4s, %[va].4s\n"
- "ldr q10, [%[y_ptr], #0x80]\n"
- "fmla v5.4s, v11.4s, %[va].4s\n"
- "ldr q11, [%[y_ptr], #0x90]\n"
- "fmla v6.4s, v12.4s, %[va].4s\n"
- "ldr q12, [%[y_ptr], #0xa0]\n"
- "str q2, [%[y_ptr], #0x00]\n"
- "fmla v7.4s, v13.4s, %[va].4s\n"
- "ldr q13, [%[y_ptr], #0xb0]\n"
- "str q3, [%[y_ptr], #0x10]\n"
- "fmla v8.4s, v14.4s, %[va].4s\n"
- "ldr q14, [%[y_ptr], #0xc0]\n"
- "str q4, [%[y_ptr], #0x20]\n"
- "fmla v9.4s, v15.4s, %[va].4s\n"
- "ldr q15, [%[y_ptr], #0xd0]\n"
- "str q5, [%[y_ptr], #0x30]\n"
- "fmla v10.4s, v16.4s, %[va].4s\n"
- "ldr q16, [%[y_ptr], #0xe0]\n"
- "str q6, [%[y_ptr], #0x40]\n"
- "fmla v11.4s, v17.4s, %[va].4s\n"
- "ldr q17, [%[y_ptr], #0xf0]\n"
- "str q7, [%[y_ptr], #0x50]\n"
- "fmla v12.4s, v18.4s, %[va].4s\n"
- "ldr q18, [%[y_ptr], #0x100]\n"
- "str q8, [%[y_ptr], #0x60]\n"
- "fmla v13.4s, v19.4s, %[va].4s\n"
- "ldr q19, [%[y_ptr], #0x110]\n"
- "str q9, [%[y_ptr], #0x70]\n"
- "fmla v14.4s, v20.4s, %[va].4s\n"
- "ldr q20, [%[y_ptr], #0x120]\n"
- "str q10, [%[y_ptr], #0x80]\n"
- "fmla v15.4s, v21.4s, %[va].4s\n"
- "ldr q21, [%[y_ptr], #0x130]\n"
- "str q11, [%[y_ptr], #0x90]\n"
- "fmla v16.4s, v22.4s, %[va].4s\n"
- "ldr q22, [%[y_ptr], #0x140]\n"
- "str q12, [%[y_ptr], #0xa0]\n"
- "fmla v17.4s, v23.4s, %[va].4s\n"
- "ldr q23, [%[y_ptr], #0x150]\n"
- "str q13, [%[y_ptr], #0xb0]\n"
- "fmla v18.4s, v24.4s, %[va].4s\n"
- "ldr q24, [%[y_ptr], #0x160]\n"
- "str q14, [%[y_ptr], #0xc0]\n"
- "fmla v19.4s, v25.4s, %[va].4s\n"
- "ldr q25, [%[y_ptr], #0x170]\n"
- "str q15, [%[y_ptr], #0xd0]\n"
- "fmla v20.4s, v26.4s, %[va].4s\n"
- "str q16, [%[y_ptr], #0xe0]\n"
- "fmla v21.4s, v27.4s, %[va].4s\n"
- "str q17, [%[y_ptr], #0xf0]\n"
- "fmla v22.4s, v28.4s, %[va].4s\n"
- "str q18, [%[y_ptr], #0x100]\n"
- "fmla v23.4s, v29.4s, %[va].4s\n"
- "str q19, [%[y_ptr], #0x110]\n"
- "fmla v24.4s, v30.4s, %[va].4s\n"
- "str q20, [%[y_ptr], #0x120]\n"
- "fmla v25.4s, v31.4s, %[va].4s\n"
- "str q21, [%[y_ptr], #0x130]\n"
-
- "stp q22, q23, [%[y_ptr], #0x140]\n"
- "stp q24, q25, [%[y_ptr], #0x160]\n"
- "add %[y_ptr], %[y_ptr], #0x180\n"
-
- : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k), [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr)
- : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit)
- : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
- "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
-
- if (N>0) {
- // Handle N tail - up to 95 stragglers.
-        // This is 0-23 vectors, plus optionally a 64-bit vector and/or a
- // single value for the remainder.
-
- // Independent pointers into the matrix for the odd 2 and odd 1.
- // Double up as flag to indicate whether they are needed.
- const float *odd2_aptr=NULL;
- const float *odd1_aptr=NULL;
-
- // Figure out how much work we need to do.
- int numvecs = N/4;
- int rem = N%4;
- int k=M;
-
- // Set up pointers for the odd 2/1 if needed.
- if (rem >= 2) {
- odd2_aptr = a_ptr_base + (numvecs * 4);
- }
-
- if (rem & 1) {
- odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2);
- }
-
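// Illustrative only (not from the original file): how the straggler counts
// above decompose. With, say, N = 23 leftover columns:
//   numvecs = 23 / 4 = 5   -> five full 4-float accumulators (20 columns),
//   rem     = 23 % 4 = 3   -> odd2_aptr covers two more columns and
//                             odd1_aptr covers the final single column,
// so 5*4 + 2 + 1 accounts for all 23 columns.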
- const float *a_ptr = a_ptr_base;
- const float *firstpf_ptr = a_ptr_base;
- const float *pf_ptr = a_ptr_base;
- const float *pf_limit = a_ptr + (M * lda);
-
- const float *x_ptr = Xstart;
- int vecs=0; // Working variable to count how many vectors to work on.
- int dopf=1; // Track whether we are doing prefetches.
-
- // Figure out how many cache lines we need to prefetch each time.
- int numpfs = (N + 15) / 16;
-
- // Do initial prefetches
- for (int i=0; i<firstpfd+1; i++) {
- prefetch_1x(firstpf_ptr);
- firstpf_ptr += lda;
- }
-
- // Do "main" prefetches - adapt number to the number we actually need.
- if (numpfs > 1) {
- for (int i=0; i<pfd+1; i++) {
- switch (numpfs) {
- case 2:
- prefetch_1x(pf_ptr + 16);
- break;
-
- case 3:
- prefetch_2x(pf_ptr + 16);
- break;
-
- case 4:
- prefetch_3x(pf_ptr + 16);
- break;
-
- case 5:
- prefetch_4x(pf_ptr + 16);
- break;
-
- case 6:
- prefetch_5x(pf_ptr + 16);
- break;
- }
- pf_ptr += lda;
- }
- } else {
- // Just disable additional prefetches
- dopf=0;
- }
-
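// Rough sketch (not from the original file; assumes the usual 64-byte cache
// line on these cores, i.e. 16 floats per line): numpfs above is the ceiling
// division ceil(N / 16), counting how many cache lines one row of the
// straggler region spans, e.g.
//   int numpfs = (33 + 15) / 16;   // = 3 lines for N = 33
// The switch then picks prefetch_1x .. prefetch_5x to cover the lines beyond
// the first one, which the firstpf_ptr prefetches already touch.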
- // Do the real work
- __asm __volatile (
- // Initialize all the vectors - not worth skipping this if only
- // some are needed.
- "movi v8.4s,#0x0\n"
- "ldr w0, [%[x_ptr]]\n"
- "movi v9.4s,#0x0\n"
- "movi v10.4s,#0x0\n"
- "movi v11.4s,#0x0\n"
- "movi v12.4s,#0x0\n"
- "movi v13.4s,#0x0\n"
- "movi v14.4s,#0x0\n"
- "movi v15.4s,#0x0\n"
- "movi v16.4s, #0x0\n"
- "movi v17.4s, #0x0\n"
- "movi v18.4s, #0x0\n"
- "movi v19.4s, #0x0\n"
- "movi v20.4s, #0x0\n"
- "movi v21.4s, #0x0\n"
- "movi v22.4s, #0x0\n"
- "movi v23.4s, #0x0\n"
- "movi v24.4s, #0x0\n"
- "movi v25.4s, #0x0\n"
- "movi v26.4s, #0x0\n"
- "movi v27.4s, #0x0\n"
- "movi v28.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- "movi v30.4s, #0x0\n"
- "movi v6.2s, #0x0\n"
- "movi v5.2s, #0x0\n"
-
- "1:\n"
- ASM_PREFETCH("[%[firstpf_ptr]]\n")
- "11:\n"
- "dup v0.4s, w0\n"
- "ldr w0, [%[x_ptr], #4]\n"
- "add %[x_ptr], %[x_ptr], #4\n"
-
- "cbz %w[numvecs], 2f\n"
- "mov %w[vecs], %w[numvecs]\n"
-
- // Vector 0
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x00]\n"
- "fmla v8.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 1
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x10]\n"
- "fmla v9.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 2
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x20]\n"
- "fmla v10.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 3
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x30]\n"
- "fmla v11.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 3f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x40]")
- "3:\n"
- "beq 2f\n"
-
- // Vector 4
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x40]\n"
- "fmla v12.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 5
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x50]\n"
- "fmla v13.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 6
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x60]\n"
- "fmla v14.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 7
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x70]\n"
- "fmla v15.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 4f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x80]")
- "4:\n"
- "beq 2f\n"
-
- // Vector 8
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x80]\n"
- "fmla v16.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 9
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x90]\n"
- "fmla v17.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 10
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xa0]\n"
- "fmla v18.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 11
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xb0]\n"
- "fmla v19.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 5f\n"
- ASM_PREFETCH("[%[pf_ptr], #0xc0]")
- "5:\n"
- "beq 2f\n"
-
- // Vector 12
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xc0]\n"
- "fmla v20.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 13
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xd0]\n"
- "fmla v21.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 14
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xe0]\n"
- "fmla v22.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 15
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0xf0]\n"
- "fmla v23.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 6f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x100]")
- "6:\n"
- "beq 2f\n"
-
- // Vector 16
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x100]\n"
- "fmla v24.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 17
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x110]\n"
- "fmla v25.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 18
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x120]\n"
- "fmla v26.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 19
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x130]\n"
- "fmla v27.4s, v7.4s, v0.4s\n"
- // Prefetch
- "cbz %w[dopf], 7f\n"
- ASM_PREFETCH("[%[pf_ptr], #0x140]")
- "7:\n"
- "beq 2f\n"
-
- // Vector 20
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x140]\n"
- "fmla v28.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 21
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x150]\n"
- "fmla v29.4s, v7.4s, v0.4s\n"
- "beq 2f\n"
- // Vector 22
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7,[%[a_ptr], #0x160]\n"
- "fmla v30.4s, v7.4s, v0.4s\n"
-
- "2:\n"
- "add %[a_ptr], %[a_ptr], %[jump]\n"
-
- // Do the odd 2-vector, if needed
- "cbz %[odd2_aptr], 8f\n"
- "ldr d7, [%[odd2_aptr]]\n"
- "fmla v6.2s, v7.2s, v0.2s\n"
- "add %[odd2_aptr], %[odd2_aptr], %[jump]\n"
-
- "8:\n"
- // Do the odd 1-vector, if needed
- "cbz %[odd1_aptr], 9f\n"
- "ldr s7, [%[odd1_aptr]]\n"
- "fmla v5.2s, v7.2s, v0.2s\n"
- "add %[odd1_aptr], %[odd1_aptr], %[jump]\n"
-
- // Get out if needed.
- "9:\n"
- "subs %w[k], %w[k], #1\n"
- "beq 10f\n"
-
- // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf"
- "add %[pf_ptr], %[pf_ptr], %[jump]\n"
- "cmp %[pf_ptr], %[pf_limit]\n"
- "csel %w[dopf], %w[dopf], WZR, LT\n"
-
- // Update the "leading" prefetch pointer, don't do the first
- // instruction of the loop if it's over the limit.
- "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
- "cmp %[firstpf_ptr], %[pf_limit]\n"
- "blt 1b\n"
- "b 11b\n"
-
- // Now write out the outputs
- "10:\n"
- "cbz %w[numvecs], 12f\n"
- "mov %w[vecs], %w[numvecs]\n"
-
- // Vector 0
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v8.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 1
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v9.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 2
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v10.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 3
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v11.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 4
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v12.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 5
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v13.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 6
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v14.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 7
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v15.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 8
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v16.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 9
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v17.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 10
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v18.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 11
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v19.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 12
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v20.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 13
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v21.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 14
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v22.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 15
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v23.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 16
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v24.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 17
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v25.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 18
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v26.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 19
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v27.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 20
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v28.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 21
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v29.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
- "beq 12f\n"
- // Vector 22
- "subs %w[vecs], %w[vecs], #1\n"
- "ldr q7, [%[y_ptr]]\n"
- "fmla v7.4s, v30.4s, %[va].4s\n"
- "str q7, [%[y_ptr]], #0x10\n"
-
- // Odd 2
- "12:\n"
- "cbz %[odd2_aptr], 13f\n"
- "ldr d7, [%[y_ptr]]\n"
- "fmla v7.2s, v6.2s, %[va].2s\n"
- "str d7, [%[y_ptr]], #0x8\n"
-
- // Odd 1
- "13:\n"
- "cbz %[odd1_aptr], 14f\n"
- "ldr s7, [%[y_ptr]]\n"
- "fmla v7.2s, v5.2s, %[va].2s\n"
- "str s7, [%[y_ptr]]\n"
-
- "14:\n"
- : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k),
- [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr),
- [odd1_aptr] "+r" (odd1_aptr), [odd2_aptr] "+r" (odd2_aptr),
- [dopf] "+r" (dopf), [vecs] "+r" (vecs)
- : [jump] "r" (jump), [va] "w" (va), [pf_limit] "r" (pf_limit), [numvecs] "r" (numvecs)
- : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
- "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
- "v27", "v28", "v29", "v30", "v31", "cc"
- );
- }
-}
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp
deleted file mode 100644
index ddd67e8ee2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/merges/a32_merge_float_8x6.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include "../asmlib.hpp"
-
-#include <arm_neon.h>
-
-template<>
-inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
- const float *inptr = in;
-// prefetch_6x(inptr);
-// prefetch_6x(inptr + 96);
-
- float32x4_t av = vdupq_n_f32(alpha);
- float32x4_t bv = vdupq_n_f32(beta);
-
- for (int y=y0; y<ymax; y+=8) {
- float *outptr0 = out + (y * ldout) + x0;
- float *outptr1 = outptr0 + ldout;
- float *outptr2 = outptr1 + ldout;
- float *outptr3 = outptr2 + ldout;
- float *outptr4 = outptr3 + ldout;
- float *outptr5 = outptr4 + ldout;
-
-// prefetch_2x(outptr0);
-// prefetch_2x(outptr1);
-// prefetch_2x(outptr2);
-// prefetch_2x(outptr3);
-// prefetch_2x(outptr4);
-// prefetch_2x(outptr5);
-
- for (int i=x0; i<xmax; i+=8) {
- float dummyres[8];
-
- /* Make sure we throw away results if Y isn't a multiple of 8.
- * We do this by pointing the result pointer at a dummy buffer
- * we later discard. */
- if ((y+5) >= ymax) {
- switch ((y + 5) - ymax) {
- case 4:
- outptr1 = dummyres;
- case 3:
- outptr2 = dummyres;
- case 2:
- outptr3 = dummyres;
- case 1:
- outptr4 = dummyres;
- case 0:
- outptr5 = dummyres;
- default:
- break;
- }
- }
-
- /* For ragged X, manually copy over the valid results. */
- if ((i+7) >= xmax) {
- for (int xi=0; xi<8; xi++) {
- if ((i+xi) < xmax) {
- *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
- outptr0++;
- *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
- outptr1++;
- *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
- outptr2++;
- *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
- outptr3++;
- *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
- outptr4++;
- *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
- outptr5++;
- }
- }
- inptr += 48;
- } else {
- /* Optimized routine to copy an entire block */
- __asm __volatile (
- // Rows 0-1
- "VLD1.32 {d8-d11}, [%[outptr0]]\n"
- "VMUL.f32 q4, q4, %q[bv]\n"
- "VLD1.32 {d12-d15}, [%[outptr1]]\n"
- "VMUL.f32 q5, q5, %q[bv]\n"
- "VLD1.32 {d0-d3}, [%[inptr]]!\n"
- "VMUL.f32 q6, q6, %q[bv]\n"
- "VLD1.32 {d4-d7}, [%[inptr]]!\n"
- "VMUL.f32 q7, q7, %q[bv]\n"
-
- "VMLA.f32 q4, q0, %q[av]\n"
- ASM_PREFETCH("[%[inptr], #352]")
- "VMLA.f32 q5, q1, %q[av]\n"
- "VST1.32 {d8-d11}, [%[outptr0]]!\n"
- ASM_PREFETCH("[%[inptr], #416]")
- "VMLA.f32 q6, q2, %q[av]\n"
- ASM_PREFETCH("[%[inptr], #480]")
- "VMLA.f32 q7, q3, %q[av]\n"
- "VST1.32 {d12-d15}, [%[outptr1]]!\n"
-
- // Rows 2-3
- "VLD1.32 {d8-d11}, [%[outptr2]]\n"
- "VMUL.f32 q4, q4, %q[bv]\n"
- "VLD1.32 {d12-d15}, [%[outptr3]]\n"
- "VMUL.f32 q5, q5, %q[bv]\n"
- "VLD1.32 {d0-d3}, [%[inptr]]!\n"
- "VMUL.f32 q6, q6, %q[bv]\n"
- "VLD1.32 {d4-d7}, [%[inptr]]!\n"
- "VMUL.f32 q7, q7, %q[bv]\n"
-
- "VMLA.f32 q4, q0, %q[av]\n"
- ASM_PREFETCH("[%[outptr0], #96]")
- "VMLA.f32 q5, q1, %q[av]\n"
- "VST1.32 {d8-d11}, [%[outptr2]]!\n"
- ASM_PREFETCH("[%[outptr1], #96]")
- "VMLA.f32 q6, q2, %q[av]\n"
- ASM_PREFETCH("[%[outptr2], #96]")
- "VMLA.f32 q7, q3, %q[av]\n"
- "VST1.32 {d12-d15}, [%[outptr3]]!\n"
-
- // Rows 4-5
- "VLD1.32 {d8-d11}, [%[outptr4]]\n"
- "VMUL.f32 q4, q4, %q[bv]\n"
- "VLD1.32 {d12-d15}, [%[outptr5]]\n"
- "VMUL.f32 q5, q5, %q[bv]\n"
- "VLD1.32 {d0-d3}, [%[inptr]]!\n"
- "VMUL.f32 q6, q6, %q[bv]\n"
- "VLD1.32 {d4-d7}, [%[inptr]]!\n"
- "VMUL.f32 q7, q7, %q[bv]\n"
-
- "VMLA.f32 q4, q0, %q[av]\n"
- ASM_PREFETCH("[%[outptr3], #96]")
- "VMLA.f32 q5, q1, %q[av]\n"
- "VST1.32 {d8-d11}, [%[outptr4]]!\n"
- ASM_PREFETCH("[%[outptr4], #96]")
- "VMLA.f32 q6, q2, %q[av]\n"
- ASM_PREFETCH("[%[outptr5], #128]")
- "VMLA.f32 q7, q3, %q[av]\n"
- "VST1.32 {d12-d15}, [%[outptr5]]!\n"
- : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
- [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [inptr] "+r" (inptr)
- : [av] "w" (av), [bv] "w" (bv)
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
- );
- }
- }
- }
-}
-
-#endif // __arm__
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
deleted file mode 100644
index e8edddb4f4..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "../asmlib.hpp"
-
-template<>
-inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta) {
- const float *inptr = in;
- prefetch_6x(inptr);
- prefetch_6x(inptr + 96);
-
- float32x4_t av = vdupq_n_f32(alpha);
- float32x4_t bv = vdupq_n_f32(beta);
-
- for (int y=y0; y<ymax; y+=8) {
- float *outptr0 = out + (y * ldout) + x0;
- float *outptr1 = outptr0 + ldout;
- float *outptr2 = outptr1 + ldout;
- float *outptr3 = outptr2 + ldout;
- float *outptr4 = outptr3 + ldout;
- float *outptr5 = outptr4 + ldout;
- float *outptr6 = outptr5 + ldout;
- float *outptr7 = outptr6 + ldout;
-
- prefetch_2x(outptr0);
- prefetch_2x(outptr1);
- prefetch_2x(outptr2);
- prefetch_2x(outptr3);
- prefetch_2x(outptr4);
- prefetch_2x(outptr5);
- prefetch_2x(outptr6);
- prefetch_2x(outptr7);
-
- for (int i=x0; i<xmax; i+=12) {
- float dummyres[12];
-
- /* Make sure we throw away results if Y isn't a multiple of 8.
- * We do this by pointing the result pointer at a dummy buffer
- * we later discard. */
- if ((y+7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- outptr1 = dummyres;
- case 5:
- outptr2 = dummyres;
- case 4:
- outptr3 = dummyres;
- case 3:
- outptr4 = dummyres;
- case 2:
- outptr5 = dummyres;
- case 1:
- outptr6 = dummyres;
- case 0:
- outptr7 = dummyres;
- default:
- break;
- }
- }
-
- /* For ragged X, manually copy over the valid results. */
- if ((i+11) >= xmax) {
- for (int xi=0; xi<12; xi++) {
- if ((i+xi) < xmax) {
- *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
- outptr0++;
- *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
- outptr1++;
- *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
- outptr2++;
- *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
- outptr3++;
- *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
- outptr4++;
- *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
- outptr5++;
- *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
- outptr6++;
- *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
- outptr7++;
- }
- }
- inptr += 96;
- } else {
- /* Optimized routine to copy an entire block */
- __asm __volatile (
- // Rows 0-1
- "LDP q16, q17, [%[outptr0]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr0], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr1]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr1], #32]\n"
- ASM_PREFETCH("[%[inptr], #768]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr]]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #32]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #64]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #832]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr0]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr0]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #896]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr1]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr1]], #16\n"
-
- // Rows 2-3
- "LDP q16, q17, [%[outptr2]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr2], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr3]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr3], #32]\n"
- ASM_PREFETCH("[%[inptr], #960]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr], #96]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #128]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #160]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #1024]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr2]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr2]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[inptr], #1088]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr3]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr3]], #16\n"
-
- // Rows 4-5
- ASM_PREFETCH("[%[outptr0], #80]")
- "LDP q16, q17, [%[outptr4]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr4], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr5]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr5], #32]\n"
- ASM_PREFETCH("[%[outptr1], #80]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr], #192]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #224]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #256]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr2], #80]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr4]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr4]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr3], #80]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr5]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr5]], #16\n"
-
- // Rows 6-7
- ASM_PREFETCH("[%[outptr4], #80]")
- "LDP q16, q17, [%[outptr6]]\n"
- "FMUL v16.4s, v16.4s, %[bv].4s\n"
- "LDR q18, [%[outptr6], #32]\n"
- "FMUL v17.4s, v17.4s, %[bv].4s\n"
- "LDP q19, q20, [%[outptr7]]\n"
- "FMUL v18.4s, v18.4s, %[bv].4s\n"
- "LDR q21, [%[outptr7], #32]\n"
- ASM_PREFETCH("[%[outptr5], #80]")
- "FMUL v19.4s, v19.4s, %[bv].4s\n"
- "LDP q0, q1, [%[inptr], #288]\n"
- "FMUL v20.4s, v20.4s, %[bv].4s\n"
- "LDP q2, q3, [%[inptr], #320]\n"
- "FMUL v21.4s, v21.4s, %[bv].4s\n"
- "LDP q4, q5, [%[inptr], #352]\n"
- "FMLA v16.4s, v0.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr6], #128]")
- "FMLA v17.4s, v1.4s, %[av].4s\n"
- "STP q16, q17, [%[outptr6]], #32\n"
- "FMLA v18.4s, v2.4s, %[av].4s\n"
- "STR q18, [%[outptr6]], #16\n"
- "FMLA v19.4s, v3.4s, %[av].4s\n"
- ASM_PREFETCH("[%[outptr7], #128]")
- "FMLA v20.4s, v4.4s, %[av].4s\n"
- "STP q19, q20, [%[outptr7]], #32\n"
- "FMLA v21.4s, v5.4s, %[av].4s\n"
- "STR q21, [%[outptr7]], #16\n"
- "ADD %[inptr], %[inptr], #384\n"
- : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3),
- [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
- [inptr] "+r" (inptr)
- : [av] "w" (av), [bv] "w" (bv)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21"
- );
- }
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
new file mode 100644
index 0000000000..b7cc3d773b
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/newgemm_lib.hpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fstream>
+#include <iostream>
+#include <regex>
+#include <sstream>
+#include <thread>
+
+extern int l1_cache_size;
+extern int l2_cache_size;
+extern int force_cpu;
+
+#ifdef __ANDROID__
+/* Minimal stand-ins for std::stoul/std::stoi on Android toolchains that lack
+ * them. Callers in this file pass nullptr for 'pos', so only write it back
+ * when a valid pointer is supplied. Note this stoi ignores 'base'. */
+inline unsigned long stoul( const std::string& str, std::size_t* pos = 0, int base = 10 )
+{
+ char *end;
+ const unsigned long ret = strtoul( str.c_str(), &end, base);
+ if (pos != nullptr) {
+ *pos = end - str.c_str();
+ }
+ return ret;
+}
+inline int stoi( const std::string& str, std::size_t* pos = 0, int base = 10 )
+{
+ return atoi(str.c_str());
+}
+#endif
+
+
+#ifndef BARE_METAL
+#include <sys/auxv.h>
+
+/* Get HWCAP bits from asm/hwcap.h */
+#include <asm/hwcap.h>
+#endif /* !BARE_METAL */
+
+/* Make sure the bits we care about are defined, just in case asm/hwcap.h is
+ * out of date (or for bare metal mode) */
+#ifndef HWCAP_ASIMDHP
+#define HWCAP_ASIMDHP (1 << 10)
+#endif
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1 << 11)
+#endif
+
+#ifndef HWCAP_ASIMDDP
+#define HWCAP_ASIMDDP (1 << 20)
+#endif
+
+#define CPUINFO_HACK
+
+//unsigned int get_cpu_impl();
+
+
+/* CPU models - we only need to detect CPUs we have
+ * microarchitecture-specific code for.
+ *
+ * Architecture features are detected via HWCAPs.
+ */
+enum class CPUModel {
+ GENERIC = 0x0001,
+ A53 = 0x0010,
+ A55r0 = 0x0011,
+ A55r1 = 0x0012,
+};
+
+class CPUInfo
+{
+private:
+ struct PerCPUData {
+ CPUModel model = CPUModel::GENERIC;
+ uint32_t midr = 0;
+ bool model_set = false;
+ };
+
+ std::vector<PerCPUData> _percpu={};
+
+ bool _cpuid = false;
+ bool _fp16 = false;
+ bool _dotprod = false;
+
+ unsigned int L1_cache_size = 32768;
+ unsigned int L2_cache_size = 262144;
+
+ /* Convert an MIDR register value to a CPUModel enum value. */
+ CPUModel midr_to_model(const unsigned int midr) const {
+ CPUModel model;
+
+ // Unpack variant and CPU ID
+ int variant = (midr >> 20) & 0xF;
+ int cpunum = (midr >> 4) & 0xFFF;
+
+ /* Only CPUs we have code paths for are detected. All other CPUs
+ * can be safely classed as "GENERIC"
+ */
+
+ switch(cpunum) {
+ case 0xd03:
+ model = CPUModel::A53;
+ break;
+
+ case 0xd05:
+ if (variant) {
+ model = CPUModel::A55r1;
+ } else {
+ model = CPUModel::A55r0;
+ }
+ break;
+
+ default:
+ model = CPUModel::GENERIC;
+ break;
+ }
+
+ return model;
+ }
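+
+ /* Worked example (illustrative): a MIDR of 0x411fd050 (Cortex-A55 r1p0)
+  * gives variant = (0x411fd050 >> 20) & 0xF = 1 and
+  * cpunum = (0x411fd050 >> 4) & 0xFFF = 0xd05, so midr_to_model() returns
+  * CPUModel::A55r1; 0x410fd034 has cpunum 0xd03 and maps to CPUModel::A53. */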
+
+ /* If the CPUID capability is present, MIDR information is provided in
+  * /sys. Use that to populate the CPU model table. */
+ void populate_models_cpuid() {
+ for (unsigned long int i=0; i<_percpu.size(); i++) {
+ std::stringstream str;
+ str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1";
+ std::ifstream file;
+
+ file.open(str.str(), std::ios::in);
+
+ if (file.is_open()) {
+ std::string line;
+
+ if (bool(getline(file, line))) {
+ const unsigned long midr = stoul(line, nullptr, 16);
+
+ _percpu[i].midr = (midr & 0xffffffff);
+ _percpu[i].model = midr_to_model(_percpu[i].midr);
+ _percpu[i].model_set = true;
+ }
+ }
+ }
+ }
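+
+ /* Example (illustrative): each such file, e.g.
+  * /sys/devices/system/cpu/cpu0/regs/identification/midr_el1, holds a single
+  * hex value such as "0x411fd050", which stoul(line, nullptr, 16) parses
+  * directly. */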
+
+ /* If "long-form" cpuinfo is present, parse that to populate models. */
+ void populate_models_cpuinfo() {
+ std::regex proc_regex("^processor.*(\\d+)$");
+ std::regex imp_regex("^CPU implementer.*0x(..)$");
+ std::regex var_regex("^CPU variant.*0x(.)$");
+ std::regex part_regex("^CPU part.*0x(...)$");
+ std::regex rev_regex("^CPU revision.*(\\d+)$");
+
+ std::ifstream file;
+ file.open("/proc/cpuinfo", std::ios::in);
+
+ if (file.is_open()) {
+ std::string line;
+ int midr=0;
+ int curcpu=-1;
+
+ while(bool(getline(file, line))) {
+ std::smatch match;
+
+ if (std::regex_match(line, match, proc_regex)) {
+ std::string id = match[1];
+ int newcpu=stoi(id, nullptr, 0);
+
+ if (curcpu >= 0 && midr==0) {
+ // Matched a new CPU ID without any description of the previous one - looks like old format.
+ return;
+ }
+
+ if (curcpu >= 0) {
+ _percpu[curcpu].midr = midr;
+ _percpu[curcpu].model = midr_to_model(midr);
+ _percpu[curcpu].model_set = true;
+
+ printf("CPU %d: %x\n",curcpu,midr);
+ }
+
+ midr=0;
+ curcpu=newcpu;
+
+ continue;
+ }
+
+ if (std::regex_match(line, match, imp_regex)) {
+ int impv = stoi(match[1], nullptr, 16);
+ midr |= (impv << 24);
+ continue;
+ }
+
+ if (std::regex_match(line, match, var_regex)) {
+ int varv = stoi(match[1], nullptr, 16);
+ // The variant field occupies MIDR bits [23:20] (matching midr_to_model above).
+ midr |= (varv << 20);
+ continue;
+ }
+
+ if (std::regex_match(line, match, part_regex)) {
+ int partv = stoi(match[1], nullptr, 16);
+ midr |= (partv << 4);
+ continue;
+ }
+
+ if (std::regex_match(line, match, rev_regex)) {
+ int regv = stoi(match[1], nullptr, 10);
+ midr |= (regv);
+ midr |= (0xf << 16);
+ continue;
+ }
+ }
+
+ if (curcpu >= 0) {
+ _percpu[curcpu].midr = midr;
+ _percpu[curcpu].model = midr_to_model(midr);
+ _percpu[curcpu].model_set = true;
+
+ printf("CPU %d: %x\n",curcpu,midr);
+ }
+ }
+ }
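+
+ /* Illustrative example of the "long-form" /proc/cpuinfo block the regexes
+  * above are written against:
+  *   processor       : 4
+  *   CPU implementer : 0x41
+  *   CPU variant     : 0x1
+  *   CPU part        : 0xd05
+  *   CPU revision    : 0
+  * which assembles to midr = (0x41 << 24) | (0x1 << 20) | (0xf << 16) |
+  * (0xd05 << 4) | 0 = 0x411fd050 and is classified as CPUModel::A55r1. */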
+
+ /* Work out the number of CPUs in the system, i.e. the maximum valid
+  * CPU ID plus one. This reads /sys/devices/system/cpu/present to get
+  * the information. */
+ int get_max_cpus() {
+ int max_cpus = 1;
+
+#ifndef BARE_METAL
+ std::ifstream CPUspresent;
+ CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
+ bool success = false;
+
+ if (CPUspresent.is_open()) {
+ std::string line;
+
+ if (bool(getline(CPUspresent, line))) {
+ /* The content of this file is a list of ranges or single values, e.g.
+ * 0-5, or 1-3,5,7 or similar. As we are interested in the
+ * max valid ID, we just need to find the last valid
+ * delimiter ('-' or ',') and parse the integer immediately after that.
+ */
+ auto startfrom=line.begin();
+
+ for (auto i=line.begin(); i<line.end(); ++i) {
+ if (*i=='-' || *i==',') {
+ startfrom=i+1;
+ }
+ }
+
+ line.erase(line.begin(), startfrom);
+
+ max_cpus = stoi(line, nullptr, 0) + 1;
+ success = true;
+ }
+ }
+
+ // Return std::thread::hardware_concurrency() as a fallback.
+ if (!success) {
+ max_cpus = std::thread::hardware_concurrency();
+ }
+#endif // !BARE_METAL
+
+ return max_cpus;
+ }
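+
+ /* Example (illustrative): if /sys/devices/system/cpu/present contains "0-5"
+  * this returns 6; for "1-3,5,7" it returns 8, i.e. the highest present CPU
+  * ID plus one rather than the number of entries listed. */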
+
+public:
+ CPUInfo() {
+#ifndef BARE_METAL
+ unsigned long hwcaps = getauxval(AT_HWCAP);
+
+ if (hwcaps & HWCAP_CPUID) {
+ _cpuid = true;
+ }
+
+ if (hwcaps & HWCAP_ASIMDHP) {
+ _fp16 = true;
+ }
+
+ if (hwcaps & HWCAP_ASIMDDP) {
+ _dotprod = true;
+ }
+
+#ifdef __aarch64__
+ /* Pre-4.15 kernels don't have the ASIMDDP bit.
+ *
+ * Although the CPUID bit allows us to read the feature register
+ * directly, the kernel quite sensibly masks this to only show
+ * features known by it to be safe to show to userspace. As a
+ * result, pre-4.15 kernels won't show the relevant bit in the
+ * feature registers either.
+ *
+ * So for now, use a whitelist of CPUs known to support the feature.
+ */
+ if (!_dotprod && _cpuid) {
+ /* List of CPUs with dot product support: A55r1 A75r1 A75r2 */
+ const unsigned int dotprod_whitelist_masks[] = { 0xfff0fff0, 0xfff0fff0, 0xfff0fff0, 0 };
+ const unsigned int dotprod_whitelist_values[] = { 0x4110d050, 0x4110d0a0, 0x4120d0a0, 0 };
+
+ unsigned long cpuid;
+
+ __asm __volatile (
+ "mrs %0, midr_el1\n"
+ : "=r" (cpuid)
+ :
+ :
+ );
+
+ for (int i=0;dotprod_whitelist_values[i];i++) {
+ if ((cpuid & dotprod_whitelist_masks[i]) == dotprod_whitelist_values[i]) {
+ _dotprod = true;
+ break;
+ }
+ }
+ }
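+
+ /* Example (illustrative): a Cortex-A55 r1p0 reads MIDR 0x411fd050;
+  * 0x411fd050 & 0xfff0fff0 == 0x4110d050 matches the first whitelist entry,
+  * so the dot product kernels are enabled even when the kernel does not
+  * report HWCAP_ASIMDDP. */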
+#endif
+ _percpu.resize(get_max_cpus());
+#endif
+ if (_cpuid) {
+ populate_models_cpuid();
+ } else {
+ populate_models_cpuinfo();
+ }
+ }
+
+ void set_fp16(const bool fp16) {
+ _fp16 = fp16;
+ }
+
+ void set_dotprod(const bool dotprod) {
+ _dotprod = dotprod;
+ }
+
+ void set_cpu_model(unsigned long cpuid, CPUModel model) {
+ if (_percpu.size() > cpuid) {
+ _percpu[cpuid].model = model;
+ _percpu[cpuid].model_set = true;
+ }
+ }
+
+ bool has_fp16() const {
+ return _fp16;
+ }
+
+ bool has_dotprod() const {
+ return _dotprod;
+ }
+
+ CPUModel get_cpu_model(unsigned long cpuid) const {
+ if (cpuid < _percpu.size()) {
+ return _percpu[cpuid].model;
+ }
+
+ return CPUModel::GENERIC;
+ }
+
+ CPUModel get_cpu_model() const {
+#ifdef BARE_METAL
+ return get_cpu_model(0);
+#else
+ return get_cpu_model(sched_getcpu());
+#endif
+ }
+
+ unsigned int get_L1_cache_size() const {
+ return L1_cache_size;
+ }
+
+ void set_L1_cache_size(unsigned int size) {
+ L1_cache_size = size;
+ }
+
+ unsigned int get_L2_cache_size() const {
+ return L2_cache_size;
+ }
+
+ void set_L2_cache_size(unsigned int size) {
+ L2_cache_size = size;
+ }
+};
+
+CPUInfo *get_CPUInfo();
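+
+/* Usage sketch (illustrative only):
+ *
+ *   CPUInfo *ci = get_CPUInfo();
+ *   if (ci->has_dotprod()) {
+ *       // prefer an SDOT/UDOT based 8-bit kernel
+ *   } else if (ci->get_cpu_model() == CPUModel::A55r1) {
+ *       // prefer the A55r1-tuned variant
+ *   }
+ */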
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp
deleted file mode 100644
index a7e17fa074..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 8x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 16x uint16_t specialisation
- TransformImpl<16, 1, true, 2, 2>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 12x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 16 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0),
- [out] "+r" (out)
- :
- : "q0", "q1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
- __asm volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in0], #192]")
- "VLD1.32 {d0-d3}, [%[in1]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "SUB %[out], %[out], #32\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [out] "+r" (out)
- :
- : "q0", "q1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "VLD1.32 {d0-d3}, [%[in0]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in0], #192]")
- "VLD1.32 {d0-d3}, [%[in1]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in1], #192]")
- "VLD1.32 {d0-d3}, [%[in2]]!\n"
- "VST1.32 {d0-d3}, [%[out]]!\n"
- ASM_PREFETCH("[%[in2], #192]")
- "VLD1.32 {d0-d3}, [%[in3]]!\n"
- "VST1.32 {d0-d3}, [%[out]]\n"
- ASM_PREFETCH("[%[in3], #192]")
- "SUB %[out], %[out], #96\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3),
- [out] "+r" (out)
- :
- : "q0", "q1", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<16, 1, true, 2, 2>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __arm__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp
deleted file mode 100644
index bdc05473b4..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-template<>
-template<typename T>
-void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint16_t *outptr = (uint16_t *)out;
- const uint16_t *inptr = (const uint16_t *)in;
-
- uint16_t zerobuff[24];
-
- for (int y=y0; y<ymax; y+=8) {
- const uint16_t *inptr0 = inptr + y * ldin + k0;
- const uint16_t *inptr1 = inptr0 + ldin;
- const uint16_t *inptr2 = inptr1 + ldin;
- const uint16_t *inptr3 = inptr2 + ldin;
- const uint16_t *inptr4 = inptr3 + ldin;
- const uint16_t *inptr5 = inptr4 + ldin;
- const uint16_t *inptr6 = inptr5 + ldin;
- const uint16_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;x>7;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- /* Everything falls through in here */
- case 6:
- inptr1 = zerobuff;
- case 5:
- inptr2 = zerobuff;
- case 4:
- inptr3 = zerobuff;
- case 3:
- inptr4 = zerobuff;
- case 2:
- inptr5 = zerobuff;
- case 1:
- inptr6 = zerobuff;
- case 0:
- inptr7 = zerobuff;
- }
- }
-
- int skippf = (x & 31);
- __asm __volatile (
- // Load up 8 elements (1 vector) from each of 8 sources.
- "CBNZ %w[skippf], 1f\n"
- ASM_PREFETCH("[%[inptr0], #128]")
- ASM_PREFETCH("[%[inptr1], #128]")
- ASM_PREFETCH("[%[inptr2], #128]")
- ASM_PREFETCH("[%[inptr3], #128]")
- "1:\n"
-
- "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
- "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
- "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
- "LDR q6, [%[inptr6]], #16\n"
- "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
- "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
- "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
- "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
- "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
- "LDR q5, [%[inptr5]], #16\n"
- "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
- "LDR q7, [%[inptr7]], #16\n"
- "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
- "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
- "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
- "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
-
- "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1
- "ZIP2 v20.8h, v8.8h, v9.8h\n"
- "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
- "ZIP2 v21.8h, v10.8h, v11.8h\n"
-
- "CBNZ %w[skippf], 2f\n"
- ASM_PREFETCH("[%[inptr4], #112]")
- ASM_PREFETCH("[%[inptr5], #112]")
- ASM_PREFETCH("[%[inptr6], #112]")
- ASM_PREFETCH("[%[inptr7], #112]")
- "2:\n"
-
- "ZIP1 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v30.8h, v16.8h, v17.8h\n"
- "ZIP1 v23.8h, v18.8h, v19.8h\n"
- "ZIP2 v31.8h, v18.8h, v19.8h\n"
-
- "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0
- "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1
- "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements
-
- "ZIP1 v0.8h, v20.8h, v21.8h\n"
- "ZIP2 v1.8h, v20.8h, v21.8h\n"
- "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements
-
- "ZIP1 v2.8h, v22.8h, v23.8h\n"
- "ZIP2 v3.8h, v22.8h, v23.8h\n"
- "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements
-
- "ZIP1 v4.8h, v30.8h, v31.8h\n"
- "ZIP2 v5.8h, v30.8h, v31.8h\n"
- "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- : [skippf] "r" (skippf)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
- "v25", "v26", "v27", "v28", "v29", "v30", "v31"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp
deleted file mode 100644
index 3c9e05223d..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_half_to_float.hpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-#include <arm_neon.h>
-#include "asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) {
- float *outptr = out;
- const __fp16 *inptr = in;
-
- __fp16 zerobuff[8];
-
- for (int y=y0; y<ymax; y+=8) {
- const __fp16 *inptr0 = inptr + y * ldin + k0;
- const __fp16 *inptr1 = inptr0 + ldin;
- const __fp16 *inptr2 = inptr1 + ldin;
- const __fp16 *inptr3 = inptr2 + ldin;
- const __fp16 *inptr4 = inptr3 + ldin;
- const __fp16 *inptr5 = inptr4 + ldin;
- const __fp16 *inptr6 = inptr5 + ldin;
- const __fp16 *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;x>7;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- /* Everything falls through in here */
- case 6:
- inptr1 = zerobuff;
- case 5:
- inptr2 = zerobuff;
- case 4:
- inptr3 = zerobuff;
- case 3:
- inptr4 = zerobuff;
- case 2:
- inptr5 = zerobuff;
- case 1:
- inptr6 = zerobuff;
- case 0:
- inptr7 = zerobuff;
- default:
- break;
- }
- }
-
- __asm __volatile (
- // Load up 8 elements (2 vectors) from each of 8 sources.
- "LDR q0, [%[inptr0]], #16\n"
- "LDR q2, [%[inptr1]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
- "FCVTL2 v3.4s, v2.8h\n"
- "FCVTL v2.4s, v2.4h\n"
- "FCVTL2 v5.4s, v4.8h\n"
- "FCVTL v4.4s, v4.4h\n"
- "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
- "LDR q8, [%[inptr4]], #16\n"
- "LDR q10, [%[inptr5]], #16\n"
- "FCVTL2 v9.4s, v8.8h\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDR q12, [%[inptr6]], #16\n"
- "FCVTL2 v11.4s, v10.8h\n"
- "FCVTL v10.4s, v10.4h\n"
- "FCVTL2 v13.4s, v12.8h\n"
- "FCVTL v12.4s, v12.4h\n"
- "ZIP1 v18.4s, v8.4s, v12.4s\n"
- "LDR q14, [%[inptr7]], #16\n"
- "FCVTL2 v15.4s, v14.8h\n"
- "FCVTL v14.4s, v14.4h\n"
- "ZIP1 v19.4s, v10.4s, v14.4s\n"
-
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
-
- "ZIP2 v16.4s, v0.4s, v4.4s\n"
- "ZIP2 v17.4s, v2.4s, v6.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
- "ZIP2 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP2 v19.4s, v10.4s, v14.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP1 v16.4s, v1.4s, v5.4s\n"
- "ZIP1 v17.4s, v3.4s, v7.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "STP q20, q21, [%[outptr]], #32\n" // Third element
-
- "ZIP1 v18.4s, v9.4s, v13.4s\n"
- "ZIP1 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Fourth element
- ASM_PREFETCH("[%[inptr7], #128]")
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v1.4s, v5.4s\n"
- "ZIP2 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Fifth element
-
- "ZIP2 v18.4s, v9.4s, v13.4s\n"
- "ZIP2 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Sixth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Seventh element
-
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp
deleted file mode 100644
index 6e07064a0c..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include "transpose_interleave_common.hpp"
-
-// Generic unblocked transposed 6x32-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<6, 1, true, 4, 4>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 12 x uint16_t specialisation
- TransformImpl<12, 1, true, 2, 2>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
-}
-
-// Generic 12x16-bit sized specialisation
-template <>
-template <typename T>
-inline void TransformImpl<12, 1, true, 2, 2>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
-}
-
-// Specialised 12 x uint16_t version
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "STR q0, [%[out]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "STR d1, [%[out], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
- : [in0] "+r" (in0),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out) {
- __asm volatile (
- "LDR q0, [%[in0]]\n"
- "LDR d1, [%[in0], #0x10]\n"
- "ADD %x[in0], %x[in0], #0x18\n"
- ASM_PREFETCH("[%[in0], #192]")
-
- "LDR x21, [%[in1]]\n"
- "LDR q2, [%[in1], #0x08]\n"
- "INS v1.d[1], x21\n"
- "ADD %x[in1], %x[in1], #0x18\n"
- "STP q0, q1, [%[out]]\n"
- "STR q2, [%x[out], #0x20]\n"
- ASM_PREFETCH("[%[in1], #192]")
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [out] "+r" (out)
- :
- : "x21", "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
- "LDR q0, [%x[in0]], #0x10\n"
- "STR q0, [%x[out]]\n"
- "LDR d1, [%x[in0]], #0x08\n"
- ASM_PREFETCH("[%[in0], #192]")
- "STR d1, [%x[out], #0x10]\n"
-
- "LDR q0, [%x[in1]], #0x10\n"
- "STR q0, [%x[out], #0x18]\n"
- "LDR d1, [%x[in1]], #0x08\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STR d1, [%x[out], #0x28]\n"
-
- "LDR q0, [%x[in2]], #0x10\n"
- "STR q0, [%x[out], #0x30]\n"
- "LDR d1, [%x[in2]], #0x08\n"
- ASM_PREFETCH("[%[in2], #192]")
- "STR d1, [%x[out], #0x40]\n"
-
- "LDR q0, [%x[in3]], #0x10\n"
- "STR q0, [%x[out], #0x48]\n"
- "LDR d1, [%x[in3]], #0x08\n"
- ASM_PREFETCH("[%[in3], #192]")
- "STR d1, [%x[out], #0x58]\n"
- : [in0] "+r" (in0),
- [in1] "+r" (in1),
- [in2] "+r" (in2),
- [in3] "+r" (in3),
- [out] "+r" (out)
- :
- : "v0", "v1", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 2, 2>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp
deleted file mode 100644
index 835e4d87aa..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined( __aarch64__) && defined( __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
-#include "transpose_interleave_common.hpp"
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDR d2, [%[in0]], #8\n"
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR d5, [%[in1]], #16\n"
- "FCVTL v5.4s, v5.4h\n"
- "STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
-}
-
-template <>
-inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out) {
- __asm __volatile (
- "LDR q0, [%[in0]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "STP q0, q1, [%[out]]\n"
- "LDR d2, [%[in0]], #8\n"
- ASM_PREFETCH("[%[in0], #192]")
- "FCVTL v2.4s, v2.4h\n"
- "LDR q3, [%[in1]], #16\n"
- "FCVTL2 v4.4s, v3.8h\n"
- "FCVTL v3.4s, v3.4h\n"
- "STP q2, q3, [%[out], #32]\n"
- "LDR d5, [%[in1]], #8\n"
- "FCVTL v5.4s, v5.4h\n"
- ASM_PREFETCH("[%[in1], #192]")
- "STP q4, q5, [%[out], #64]\n"
- "LDR q6, [%[in2]], #16\n"
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "STP q6, q7, [%[out], #96]\n"
- "LDR d8, [%[in2]], #8\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDR q9, [%[in3]], #16\n"
- "FCVTL2 v10.4s, v9.8h\n"
- "FCVTL v9.4s, v9.4h\n"
- "STP q8, q9, [%[out], #128]\n"
- "LDR d11, [%[in3]], #8\n"
- "FCVTL v11.4s, v11.4h\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
-
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
-}
-
-template <>
-template <>
-inline void TransformImpl<12, 1, true, 4, 2>::Transform(
- float* out, const __fp16* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
-}
-
-#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
deleted file mode 100644
index 231b3f181e..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/transforms/transpose_interleave_common.hpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-template <unsigned int IntBy, typename TIn, typename TOut>
-struct TransposeInterleaveCommon {
- // Override the moveblock_1xY methods to improve performance
- static inline void moveblock_1x1(const TIn *&in0, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- }
-
- static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in1++);
- }
- }
-
- static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) {
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in0++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in1++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in2++);
- }
- for (unsigned int i = 0; i < IntBy; i++) {
- *out++ = static_cast<TOut>(*in3++);
- }
- }
-
- static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) {
- const auto ldin = stride;
-
- TOut *outarray = out;
- const TIn *inarray = in;
- TOut *outptr_base = outarray;
- const TIn *inptr_base = inarray + x0 + (k0 * ldin);
- int ldout = (kmax - k0) * IntBy;
-
- int k=(kmax-k0);
- for ( ; k>3; k-=4) {
- TOut *outptr = outptr_base;
- const TIn *inptr = inptr_base;
- const TIn *inptr1 = inptr + ldin;
- const TIn *inptr2 = inptr1 + ldin;
- const TIn *inptr3 = inptr2 + ldin;
-
- prefetch_3x(inptr);
- prefetch_3x(inptr1);
- prefetch_3x(inptr2);
- prefetch_3x(inptr3);
-
- outptr_base += IntBy * 4;
- inptr_base += ldin * 4;
-
- for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
- moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
- outptr += ldout;
- }
- }
-
- if (k) {
- TOut *outptr = outptr_base;
- const TIn *inptr = inptr_base;
- const TIn *inptr1 = inptr + ldin;
- const TIn *inptr2 = inptr1 + ldin;
-
- prefetch_3x(inptr);
- prefetch_3x(inptr1);
- prefetch_3x(inptr2);
-
- for (int x = (xmax-x0) / IntBy; x > 0 ; x--) {
- switch(k) {
- case 3:
- moveblock_1x2(inptr, inptr1, outptr);
- moveblock_1x1(inptr2, outptr + IntBy * 2);
- break;
-
- case 2:
- moveblock_1x2(inptr, inptr1, outptr);
- break;
-
- case 1:
- moveblock_1x1(inptr, outptr);
- break;
- default:
- break;
- }
-
- outptr += ldout;
- }
- }
-
- // Cope with ragged X cases
- const unsigned int overflow = (xmax - x0) % IntBy;
- if (overflow) {
- const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
- TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
-
- for (int k=(kmax-k0); k>0; k--) {
- const TIn *inptr = inptr_base;
- inptr_base += ldin;
-
- for (unsigned int x=0; x < IntBy; x++) {
- TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
- *outptr++ = val;
- }
- }
- }
-}
-};
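
Note: the helper deleted above produced a transposed, IntBy-wide interleaved copy of a sub-block of the source matrix; the equivalent transforms are provided by the new arm_gemm code. Below is a minimal standalone sketch of the same layout, assuming the regular case only (no prefetching, no 4-way unrolling, and (xmax - x0) divisible by IntBy); it is illustrative and not part of this patch.

// Sketch of the layout produced by TransposeInterleaveCommon::Transform (regular case only).
// Columns x0..xmax-1 and rows k0..kmax-1 of the row-major input (row stride 'ldin') are copied
// so that each group of IntBy consecutive columns becomes one contiguous output panel; inside a
// panel the IntBy column values of a row are contiguous and successive rows (k) follow.
#include <cstdio>
#include <vector>

template <unsigned int IntBy, typename TIn, typename TOut>
void transpose_interleave(TOut *out, const TIn *in, int ldin, int x0, int xmax, int k0, int kmax)
{
    for(int x = x0; x < xmax; x += IntBy)      // one panel per IntBy columns
    {
        for(int k = k0; k < kmax; k++)         // walk down the rows of the panel
        {
            for(unsigned int i = 0; i < IntBy; i++)
            {
                *out++ = static_cast<TOut>(in[k * ldin + x + i]);
            }
        }
    }
}

int main()
{
    // 4x8 row-major matrix interleaved by 4: two panels, each holding 4 rows of 4 columns.
    std::vector<float> in(4 * 8);
    for(size_t i = 0; i < in.size(); i++)
    {
        in[i] = static_cast<float>(i);
    }
    std::vector<float> out(4 * 8);
    transpose_interleave<4>(out.data(), in.data(), 8, 0, 8, 0, 4);
    for(float v : out)
    {
        std::printf("%g ", v);
    }
    std::printf("\n");
    return 0;
}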
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
new file mode 100644
index 0000000000..2b304b8022
--- /dev/null
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_ASSEMBLY_HELPER_H__
+#define __ARM_ASSEMBLY_HELPER_H__
+
+#include "arm_compute/core/ITensor.h"
+#include "support/ToolchainSupport.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapper.h"
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+template <typename TypeInput, typename TypeOutput>
+class AssemblyKernelGlue final
+{
+public:
+ using TypeOperator = TypeInput;
+ using TypeResult = TypeOutput;
+ AssemblyKernelGlue()
+ : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr)
+ {
+ }
+ using AssemblyGemm = arm_gemm::GemmCommon<TypeInput, TypeOutput>;
+
+ const AssemblyKernelGlue<TypeInput, TypeOutput> &operator=(const AssemblyKernelGlue<TypeInput, TypeOutput> &) = delete;
+ AssemblyKernelGlue(const AssemblyKernelGlue<TypeInput, TypeOutput> &) = delete;
+
+ std::unique_ptr<AssemblyGemm> _gemm_kernel_asm;
+ std::unique_ptr<INEKernel> _optimised_kernel;
+ const ITensor *_a;
+ const ITensor *_b;
+ ITensor *_d;
+
+ /** Configures the array pointers and strides in the assembly kernel and then executes the assembly kernel.
+ * The call to set_arrays is needed to deal with input sizes containing batches (dims > 2).
+ */
+ inline void run()
+ {
+ const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
+
+ // Configure kernel window
+ Window window = calculate_max_window(*_d->info());
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+
+ // Only iterate over batches
+ Window win(window);
+ win.set(0, Window::Dimension(0, 1, 1));
+ win.set(1, Window::Dimension(0, 1, 1));
+ Iterator in0(_a, window);
+ Iterator out(_d, window);
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ const auto in0_ptr = reinterpret_cast<const TypeInput *>(in0.ptr());
+ auto out_ptr = reinterpret_cast<TypeOutput *>(out.ptr());
+ _gemm_kernel_asm->set_arrays(in0_ptr, lda, in1_ptr, ldb, out_ptr, ldd);
+ NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
+ },
+ in0, out);
+ }
+};
+
+using AssemblyKernelGlueF32 = AssemblyKernelGlue<float, float>;
+using AssemblyKernelGlueU8U32 = AssemblyKernelGlue<uint8_t, uint32_t>;
+using AssemblyKernelGlueS8S32 = AssemblyKernelGlue<int8_t, int32_t>;
+
+inline void allocate_workspace(size_t workspace_size, Tensor &workspace, MemoryGroup &memory_group, size_t alignment, unsigned int num_threads)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
+ workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment - 1) * num_threads }, 1, DataType::S8));
+ workspace.allocator()->allocate();
+}
+
+template <typename T>
+std::unique_ptr<NEGEMMAssemblyWrapper<T>> create_wrapper_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
+{
+ // TODO: Rework this function. Why are we checking the data type and other things here? Should we create another function can_run_optimised_kernel()?
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
+ {
+ return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
+ }
+ else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
+ {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ return support::cpp14::make_unique<NEGEMMAssemblyWrapper<T>>();
+#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+ return nullptr;
+}
+
+template <typename T>
+inline bool setup_assembly_kernel(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta,
+ Tensor &workspace, MemoryGroup &memory_group, T &asm_glue)
+{
+ const ::CPUInfo *ci = get_CPUInfo();
+ const int M = d->info()->tensor_shape().y();
+ const int N = d->info()->tensor_shape().x();
+ const int K = a->info()->tensor_shape().x();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ // unique_ptr to a Gemm object
+ std::unique_ptr<typename T::AssemblyGemm> asm_gemm(arm_gemm::gemm<typename T::TypeOperator, typename T::TypeResult>(*ci, M, N, K, false, false, alpha, beta, num_threads,
+ false));
+
+ // arm_compute wrapper for the Gemm object (see above)
+ std::unique_ptr<NEGEMMAssemblyWrapper<typename T::AssemblyGemm>> acl_gemm_wrapper = create_wrapper_kernel<typename T::AssemblyGemm>(a, b, c, d, alpha, beta);
+ if(acl_gemm_wrapper != nullptr && asm_gemm != nullptr)
+ {
+ acl_gemm_wrapper->configure(asm_gemm.get());
+ const size_t workspace_size = asm_gemm->get_working_size();
+ if(workspace_size)
+ {
+ // Allocate workspace
+ allocate_workspace(workspace_size, workspace, memory_group, 4096, num_threads);
+ asm_gemm->set_working_space(reinterpret_cast<typename T::TypeResult *>(workspace.buffer()));
+ }
+ const unsigned int window_size = asm_gemm->get_window_size();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ asm_gemm->set_nthreads(num_threads);
+ }
+ asm_glue._gemm_kernel_asm = std::move(asm_gemm);
+ asm_glue._optimised_kernel = std::move(acl_gemm_wrapper);
+ // We need to set up the pointers in the run() method
+ asm_glue._a = a;
+ asm_glue._b = b;
+ asm_glue._d = d;
+ return true;
+ }
+ return false;
+}
+}
+#endif /* __ARM_ASSEMBLY_HELPER_H__ */
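
A note on intended usage of the new helper: a NEON function owns an AssemblyKernelGlue member, a workspace Tensor and a MemoryGroup, calls setup_assembly_kernel() at configure time and, if that succeeds, calls the glue's run() at execution time. The sketch below is illustrative only; the class name, member names and fallback comments are assumptions for the example, not part of this patch.

// Illustrative sketch: wiring the AssemblyHelper glue into a function-style class.
#include "arm_compute/core/ITensor.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/Tensor.h"

namespace arm_compute
{
class ExampleAssemblyGemm
{
public:
    void configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta)
    {
        // Try the optimised assembly path first; returns false if no suitable kernel is available.
        _run_assembly = setup_assembly_kernel(a, b, nullptr, d, alpha, beta, _workspace, _memory_group, _asm_glue);
        // If false, a real function would configure its reference NEON kernels here instead.
    }

    void run()
    {
        if(_run_assembly)
        {
            // Sets the array pointers/strides per batch and schedules the NEGEMMAssemblyWrapper kernel.
            _asm_glue.run();
        }
        // Otherwise a real function would schedule its reference kernels here.
    }

private:
    MemoryGroup           _memory_group{};
    AssemblyKernelGlueF32 _asm_glue{};
    Tensor                _workspace{};
    bool                  _run_assembly{ false };
};
} // namespace arm_compute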
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index f2b6ef77bd..5279995be4 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -25,7 +25,6 @@
#define __ARM_COMPUTE_NEGEMM_H__
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
@@ -35,6 +34,8 @@
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
+
#include <memory>
namespace arm_compute
@@ -73,19 +74,19 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- NEGEMMInterleave4x4Kernel _interleave_kernel;
- NEGEMMTranspose1xWKernel _transpose_kernel;
- NEGEMMMatrixMultiplyKernel _mm_kernel;
- std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
- NEGEMMMatrixAdditionKernel _ma_kernel;
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _workspace;
- bool _run_vector_matrix_multiplication;
- bool _run_addition;
- bool _is_first_run;
- bool _reshape_b_only_on_first_run;
+ MemoryGroup _memory_group;
+ NEGEMMInterleave4x4Kernel _interleave_kernel;
+ NEGEMMTranspose1xWKernel _transpose_kernel;
+ NEGEMMMatrixMultiplyKernel _mm_kernel;
+ AssemblyKernelGlueF32 _asm_glue;
+ NEGEMMMatrixAdditionKernel _ma_kernel;
+ Tensor _tmp_a;
+ Tensor _tmp_b;
+ Tensor _workspace;
+ bool _run_vector_matrix_multiplication;
+ bool _run_addition;
+ bool _is_first_run;
+ bool _reshape_b_only_on_first_run;
};
}
#endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index ac5f4caa78..4ae8ee1fb3 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -36,6 +36,7 @@
#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"
@@ -149,22 +150,14 @@ private:
* @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
*/
void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
- /** Prepare the appropriate assembly optimized kernel
- *
- * @param[in] ci CPU information
- * @param[in] M M parameter of matrix multiplication
- * @param[in] N N parameter of matrix multiplication
- * @param[in] K K parameter of matrix multiplication
- */
- void configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K);
private:
+ AssemblyKernelGlueF32 _asm_glue;
MemoryGroup _memory_group;
NEIm2ColKernel _input_im2col_kernel;
NEGEMMInterleave4x4Kernel _input_interleave_kernel;
NEConvolutionLayerReshapeWeights _reshape_weights;
NEGEMMMatrixMultiplyKernel _mm_kernel;
- std::unique_ptr<NEGEMMAssemblyBaseKernel> _mm_optimised_kernel;
NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
NECol2ImKernel _output_col2im_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
index 3d213a7668..f09c94e726 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -58,6 +59,8 @@ public:
private:
MemoryGroup _memory_group;
+ AssemblyKernelGlueU8U32 _asm_glue_unsigned;
+ AssemblyKernelGlueS8S32 _asm_glue_signed;
std::unique_ptr<INEKernel> _mm_kernel;
std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index eddb3a26b7..95776f829a 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -48,7 +49,6 @@ class ITensor;
* otherwise if the DOT product instruction is available:
*
* -# @ref NEGEMMInterleaveBlockedKernel
- * -# @ref NEGEMMLowpAArch64V8P4Kernel
* -# @ref NEGEMMLowpOffsetContributionKernel
*
*/
@@ -90,6 +90,8 @@ public:
private:
MemoryGroup _memory_group;
+ AssemblyKernelGlueU8U32 _asm_glue_unsigned;
+ AssemblyKernelGlueS8S32 _asm_glue_signed;
std::unique_ptr<INEKernel> _mm_kernel;
std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index eb6130bda5..555cec5c35 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -195,6 +195,12 @@ If there is more than one release in a month then an extra sequential number is
@subsection S2_2_changelog Changelog
+v18.05 Public maintenance release
+ - Major redesign in the interface for the neon kernels implemented in assembly.
+ - Removed arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore / arm_compute::NEHGEMMAArch64FP16Kernel
+ - Added NEGEMMAssemblyWrapper and AssemblyKernelGlue which are used to execute assembly kernels in neon functions.
+ - Minor changes to the CPUInfo type to make it compatible with the new assembly gemm interface.
+
v18.03 Public maintenance release
- Various bug fixes.
- Fixed bug in @ref NEActivationLayer
@@ -301,8 +307,8 @@ v17.12 Public major release
- @ref GCTransposeKernel / @ref GCTranspose
- New NEON kernels / functions
- - @ref NEGEMMLowpAArch64A53Kernel / @ref NEGEMMLowpAArch64Kernel / @ref NEGEMMLowpAArch64V8P4Kernel / NEGEMMInterleavedBlockedKernel / @ref NEGEMMLowpAssemblyMatrixMultiplyCore
- - @ref NEHGEMMAArch64FP16Kernel
+ - arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore
+ - arm_compute::NEHGEMMAArch64FP16Kernel
- @ref NEDepthwiseConvolutionLayer3x3Kernel / @ref NEDepthwiseIm2ColKernel / @ref NEGEMMMatrixVectorMultiplyKernel / @ref NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
- @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore
- @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
@@ -340,7 +346,7 @@ v17.09 Public major release
- New validation and benchmark frameworks (Boost and Google frameworks replaced by homemade framework).
- Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both NEON and OpenCL.
- New NEON kernels / functions:
- - @ref NEGEMMAssemblyBaseKernel @ref NEGEMMAArch64Kernel
+ - arm_compute::NEGEMMAssemblyBaseKernel arm_compute::NEGEMMAArch64Kernel
- @ref NEDequantizationLayerKernel / @ref NEDequantizationLayer
- @ref NEFloorKernel / @ref NEFloor
- @ref NEL2NormalizeLayerKernel / @ref NEL2NormalizeLayer
diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp
index d9f6156fb2..6f76b5e2e7 100644
--- a/examples/graph_inception_v4.cpp
+++ b/examples/graph_inception_v4.cpp
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/core/Error.h"
#include "arm_compute/graph2.h"
#include "support/ToolchainSupport.h"
#include "utils/GraphUtils.h"
@@ -43,6 +44,9 @@ class InceptionV4Example final : public Example
public:
void do_setup(int argc, char **argv) override
{
+ // The test is disabled for now because the process gets killed on Linux Firefly 32-bit, even when using ConvolutionMethodHint::DIRECT.
+ // The code below needs review/rework before it can be re-enabled.
+#if __aarch64__
std::string data_path; /* Path to the trainable data */
std::string image; /* Image data */
std::string label; /* Label data */
@@ -90,7 +94,6 @@ public:
graph << target_hint << InputLayer(TensorDescriptor(TensorShape(299U, 299U, 3U, 1U), DataType::F32),
get_input_accessor(image, std::move(preprocessor), false))
-
// Conv2d_1a_3x3
<< ConvolutionLayer(3U, 3U, 32U,
get_weights_accessor(data_path, "/cnn_data/inceptionv4_model/Conv2d_1a_3x3_weights.npy"),
@@ -157,11 +160,18 @@ public:
// Finalize graph
graph.finalize(target_hint, enable_tuning, enable_memory_management);
+#else /* __aarch64__ */
+ using namespace arm_compute;
+ ARM_COMPUTE_UNUSED(argc);
+ ARM_COMPUTE_UNUSED(argv);
+#endif /* __aarch64__ */
}
void do_run() override
{
+#if __aarch64__
graph.run();
+#endif /* __aarch64__ */
}
private:
diff --git a/scripts/check_bad_style.sh b/scripts/check_bad_style.sh
index e5dc15c218..292cf518cd 100755
--- a/scripts/check_bad_style.sh
+++ b/scripts/check_bad_style.sh
@@ -4,7 +4,7 @@ set -e
DIRECTORIES="./arm_compute ./src ./examples ./tests ./utils ./support"
-grep -HrnP --exclude-dir=assembly --exclude-dir=convolution "/\*\*$" $DIRECTORIES | tee bad_style.log
+grep -HrnP --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "/\*\*$" $DIRECTORIES | tee bad_style.log
if (( `cat bad_style.log | wc -l` > 0 ))
then
echo ""
@@ -12,7 +12,7 @@ then
exit -1
fi
-grep -Hnr --exclude-dir=assembly --exclude-dir=convolution --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log
+grep -Hnr --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm --exclude=Doxyfile "@brief" $DIRECTORIES | tee bad_style.log
if (( `cat bad_style.log | wc -l` > 0 ))
then
echo ""
@@ -20,7 +20,7 @@ then
exit -1
fi
-grep -HnRE --exclude-dir=assembly --exclude-dir=convolution "\buint " --exclude-dir=cl_kernels --exclude-dir=cs_shaders $DIRECTORIES | tee bad_style.log
+grep -HnRE --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "\buint " --exclude-dir=cl_kernels --exclude-dir=cs_shaders $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -28,7 +28,7 @@ then
exit -1
fi
-grep -HnR --exclude-dir=assembly --exclude-dir=convolution "float32_t" $DIRECTORIES | tee bad_style.log
+grep -HnR --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "float32_t" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -36,7 +36,7 @@ then
exit -1
fi
-grep -Hnir --exclude-dir=assembly --exclude-dir=convolution "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "arm[_ ]\?cv" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -44,7 +44,7 @@ then
exit -1
fi
-grep -Hnir --exclude-dir=assembly --exclude-dir=convolution "#.*if.*defined[^(]" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "#.*if.*defined[^(]" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -52,7 +52,7 @@ then
exit -1
fi
-grep -Hnir --exclude-dir=assembly --exclude-dir=convolution "#else$\|#endif$" $DIRECTORIES | tee bad_style.log
+grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "#else$\|#endif$" $DIRECTORIES | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
@@ -60,7 +60,7 @@ then
exit -1
fi
-grep -Hnir --exclude-dir=assembly --exclude-dir=convolution "ARM_COMPUTE_AARCH64_V8_2" ./tests/validation/CL | tee bad_style.log
+grep -Hnir --exclude-dir=assembly --exclude-dir=convolution --exclude-dir=arm_gemm "ARM_COMPUTE_AARCH64_V8_2" ./tests/validation/CL | tee bad_style.log
if [[ $(cat bad_style.log | wc -l) > 0 ]]
then
echo ""
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index d6deee9b68..e5793e5061 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -40,6 +40,9 @@ def filter_clang_tidy_lines( lines ):
if "/assembly/" in line:
continue
+ if "/arm_gemm/" in line:
+ continue
+
if "/convolution/" in line:
continue
@@ -90,6 +93,8 @@ def filter_clang_tidy_lines( lines ):
("parameter 'memory_manager' is copied for each invocation but only used as a const reference" in line) or
("DeconvolutionLayer.cpp" in line and "casting (double + 0.5) to integer leads to incorrect rounding; consider using lround" in line) or
("NEWinogradLayerKernel.cpp" in line and "use '= default' to define a trivial destructor" in line) or
+ ("NEGEMMLowpMatrixMultiplyCore.cpp" in line and "constructor does not initialize these fields" in line) or
+ ("NEGEMMLowpAssemblyMatrixMultiplyCore" in line and "constructor does not initialize these fields" in line) or
"3rdparty" in line):
print_context=False
continue
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index d4ce3888fd..5d1464ace4 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -152,9 +152,9 @@ GCKernel::GCKernel(std::string name, GLuint program)
ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params_ubo_name));
_shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name));
- ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name);
+ ARM_COMPUTE_ERROR_ON_MSG(_shader_params_index == GL_INVALID_INDEX, "Failed to get index of %s", _shader_params_name);
ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size));
- ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size == 0), "Failed to get size of %s", _shader_params_name);
+ ARM_COMPUTE_ERROR_ON_MSG(_shader_params_size == 0, "Failed to get size of %s", _shader_params_name);
}
void GCKernel::cleanup()
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
deleted file mode 100644
index bffcbbf436..0000000000
--- a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 8, 6);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 6);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 8);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
- const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
- reinterpret_cast<const float *>(in1_ptr), ldb,
- reinterpret_cast<float *>(out.ptr()), ldc,
- _alpha, _beta, workspace);
- },
- in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
deleted file mode 100644
index 0eaa9aa39b..0000000000
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
- const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const float *>(in0.ptr()), lda,
- reinterpret_cast<const float *>(in1_ptr), ldb,
- reinterpret_cast<float *>(out.ptr()), ldc,
- _alpha, _beta, workspace);
- },
- in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp
deleted file mode 100644
index 0b3212bf55..0000000000
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/convolution/winograd/gemm.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMMAArch64NativeKernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(16U, 4U));
-
- const int input0_access_end_x = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
- const int input0_access_end_y = ceil_to_multiple(input0->info()->tensor_shape().y(), 4);
- const int input1_access_end_x = ceil_to_multiple(input1->info()->tensor_shape().x(), 16);
-
- AccessWindowStatic input0_access(input0->info(), 0, 0, input0_access_end_x, input0_access_end_y);
- AccessWindowStatic input1_access(input1->info(), 0, 0, input1_access_end_x, input1->info()->tensor_shape().y());
- AccessWindowRectangle output_access(output->info(), 0, 0, 16, 4);
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMAArch64NativeKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_UNUSED(info);
-
- const auto in1_ptr = reinterpret_cast<const float *>(_input1->buffer());
-
- // Calculate row strides for each matrix
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(float);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(float);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(float);
-
- // Calculate matrix sizes
- const int M = std::min(_input0->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int K = _input0->info()->tensor_shape().x();
- const int N = _input1->info()->tensor_shape().x();
-
- // Create window (Only iterate over batches)
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- // Create Iterators
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- // Execute GEMM
- execute_window_loop(win, [&](const Coordinates & id)
- {
- BlockedGemm<4, 16, float, float>(reinterpret_cast<const float *>(in0.ptr()),
- reinterpret_cast<const float *>(in1_ptr),
- reinterpret_cast<float *>(out.ptr()),
- M, K, N,
- lda, ldb, ldc);
- },
- in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
deleted file mode 100644
index 80606dcc07..0000000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel()
- : _func(nullptr)
-{
-}
-
-void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
- const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
- const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
- reinterpret_cast<const int8_t *>(in1_ptr), ldb,
- reinterpret_cast<int32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
- const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
- const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
- reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
- reinterpret_cast<uint32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- switch(input0->info()->data_type())
- {
- case DataType::S8:
- _func = &gemm_interleaved_s16_12x8;
- break;
- case DataType::U8:
- _func = &gemm_interleaved_u16_12x8;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
deleted file mode 100644
index 38f82f0407..0000000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8A targets
-#ifdef ARM_COMPUTE_AARCH64_V8A
-
-namespace arm_compute
-{
-NEGEMMLowpAArch64Kernel::NEGEMMLowpAArch64Kernel()
- : _func(nullptr)
-{
-}
-
-void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
-
- const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
- reinterpret_cast<const int8_t *>(in1_ptr), ldb,
- reinterpret_cast<int32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
- const ThreadInfo &info)
-{
- const int lda = input0->info()->strides_in_bytes().y();
- const int ldb = input1->info()->strides_in_bytes().y();
- const int ldc = output->info()->strides_in_bytes().y() / sizeof(uint32_t);
-
- const auto in1_ptr = reinterpret_cast<const uint8_t *>(input1->buffer());
-
- const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = output->info()->tensor_shape().x();
- const int K = input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(input0, window);
- Iterator out(output, window);
-
- GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *_workspace = workspace->buffer() + offset;
- size_t workspace_size = workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
- reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
- reinterpret_cast<uint32_t *>(out.ptr()), ldc,
- alpha, beta, _workspace);
- },
- in0, out);
-}
-
-void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- switch(input0->info()->data_type())
- {
- case DataType::S8:
- _func = &gemm_interleaved_s8;
- break;
- case DataType::U8:
- _func = &gemm_interleaved_u8;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 4, 4);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 4);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMMLowpAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
deleted file mode 100644
index d4fcf5e3cb..0000000000
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-// Enable only if compiled for AArch64-V8.2-A targets
-#ifdef ARM_COMPUTE_AARCH64_V8_2
-
-namespace
-{
-using namespace arm_compute;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::U8, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*output);
-
- AccessWindowRectangle output_access(output, 0, 0, 12, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8);
- const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12);
-
- bool window_changed = update_window_and_padding(win,
- AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()),
- AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()),
- output_access);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-template <typename strategy, typename To, typename Tr>
-void *align_workspace(GemmInterleaved<strategy, To, Tr> &gemm, const ThreadInfo &info, ITensor *ws)
-{
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = ws->buffer() + offset;
- size_t workspace_size = ws->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
- return workspace;
-}
-
-template <typename strategy>
-void execute_gemm(const Window &win, Iterator &in0, Iterator &in1, Iterator &out,
- const ThreadInfo &info, ITensor *ws, int M, int N, int K, bool is_transposed_0, bool is_transposed_1,
- int lda, int ldb, int ldc, float alpha, float beta)
-{
- ARM_COMPUTE_UNUSED(M);
- ARM_COMPUTE_UNUSED(N);
- ARM_COMPUTE_UNUSED(K);
- ARM_COMPUTE_UNUSED(is_transposed_0);
- ARM_COMPUTE_UNUSED(is_transposed_1);
- GemmInterleaved<strategy, typename strategy::operand_type, typename strategy::result_type> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
- void *workspace = align_workspace(gemm, info, ws);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const typename strategy::operand_type *>(in0.ptr()), lda,
- reinterpret_cast<const typename strategy::operand_type *>(in1.ptr()), ldb,
- reinterpret_cast<typename strategy::result_type *>(out.ptr()), ldc,
- alpha, beta, workspace);
- },
- in0, out);
-}
-} // namespace
-
-namespace arm_compute
-{
-void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- // Perform validate step
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y();
- const int ldb = _input1->info()->strides_in_bytes().y();
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(uint32_t);
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator in1(_input1, window);
- Iterator out(_output, window);
-
- switch(_input0->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::U8:
- {
- execute_gemm<gemm_u8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
- break;
- }
- case DataType::S8:
- {
- execute_gemm<gemm_s8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported.");
- break;
- }
- }
-}
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
diff --git a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
deleted file mode 100644
index 163014b04f..0000000000
--- a/src/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-void NEGEMVAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 12, 1);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEGEMVAArch64Kernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(sgemv_trans::operand_type);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(sgemv_trans::result_type);
-
- const auto in1_ptr = reinterpret_cast<const sgemv_trans::operand_type *>(_input1->buffer());
-
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type> gemm(&info.cpu_info, N, K);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const sgemv_trans::operand_type *>(in0.ptr()), lda,
- reinterpret_cast<const sgemv_trans::operand_type *>(in1_ptr), ldb,
- reinterpret_cast<sgemv_trans::result_type *>(out.ptr()), ldc,
- _alpha, _beta, workspace);
- },
- in0, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
deleted file mode 100644
index e84409cfd2..0000000000
--- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
-namespace arm_compute
-{
-void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
- bool is_transposed_1)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _is_transposed_0 = is_transposed_0;
- _is_transposed_1 = is_transposed_1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info());
-
- AccessWindowRectangle output_access(output->info(), 0, 0, 24, 8);
-
- const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
- const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 24);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
- AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
- output_access);
-
- INEKernel::configure(win);
-}
-
-void NEHGEMMAArch64FP16Kernel::run(const Window &window, const ThreadInfo &info)
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const int lda = _input0->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
- const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
- const int ldc = _output->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::result_type);
-
- const auto in1_ptr = reinterpret_cast<const hgemm_24x8::operand_type *>(_input1->buffer());
-
- const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
- const int N = _output->info()->tensor_shape().x();
- const int K = _input0->info()->tensor_shape().x();
-
- // Only iterate over batches
- Window win(window);
- win.set(0, Window::Dimension(0, 1, 1));
- win.set(1, Window::Dimension(0, 1, 1));
-
- Iterator in0(_input0, window);
- Iterator out(_output, window);
-
- GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
- {
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
- }
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const hgemm_24x8::operand_type *>(in0.ptr()), lda,
- reinterpret_cast<const hgemm_24x8::operand_type *>(in1_ptr), ldb,
- reinterpret_cast<hgemm_24x8::result_type *>(out.ptr()), ldc,
- _alpha, 1.f, workspace);
- },
- in0, out);
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_UNUSED(window);
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-} // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
index fa1d6e37a9..b3fcb33bfb 100644
--- a/arm_compute/core/NEON/kernels/assembly/asmlib.hpp
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,15 +30,22 @@
// Code using these macros needs to clobber x20 and x21 as they might be
// used by the workaround.
-#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
-#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
-#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
+// "Correct" version
+#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
+#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
+#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
+// Lee's uarchsim hack
+//#define ASM_PREFETCH(address) "LDNP x20, x21, " address "\n"
+
+// No preload at all
+//#define ASM_PREFETCH(address) ""
#else
-#define ASM_PREFETCH(address) "PLD " address "\n"
-#define ASM_PREFETCHW(address) "PLDW " address "\n"
+// "Correct" versions for AArch32
+#define ASM_PREFETCH(address) "PLD " address "\n"
+#define ASM_PREFETCHW(address) "PLDW " address "\n"
#endif
@@ -46,76 +53,76 @@
* Do some prefetches.
*/
template <typename T>
-static inline void prefetch_6x(const T *pfp) {
- __asm __volatile (
+static inline void prefetch_6x(const T *pfp)
+{
+ __asm __volatile(
ASM_PREFETCH("[%[pfp]]")
ASM_PREFETCH("[%[pfp], #64]")
ASM_PREFETCH("[%[pfp], #128]")
ASM_PREFETCH("[%[pfp], #192]")
ASM_PREFETCH("[%[pfp], #256]")
ASM_PREFETCH("[%[pfp], #320]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
}
template <typename T>
-static inline void prefetch_5x(const T *pfp) {
- __asm __volatile (
+static inline void prefetch_5x(const T *pfp)
+{
+ __asm __volatile(
ASM_PREFETCH("[%[pfp]]")
ASM_PREFETCH("[%[pfp], #64]")
ASM_PREFETCH("[%[pfp], #128]")
ASM_PREFETCH("[%[pfp], #192]")
ASM_PREFETCH("[%[pfp], #256]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
}
template <typename T>
-static inline void prefetch_4x(const T *pfp) {
- __asm __volatile (
+static inline void prefetch_4x(const T *pfp)
+{
+ __asm __volatile(
ASM_PREFETCH("[%[pfp]]")
ASM_PREFETCH("[%[pfp], #64]")
ASM_PREFETCH("[%[pfp], #128]")
ASM_PREFETCH("[%[pfp], #192]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
}
template <typename T>
-static inline void prefetch_3x(const T *pfp) {
- __asm __volatile (
+static inline void prefetch_3x(const T *pfp)
+{
+ __asm __volatile(
ASM_PREFETCH("[%[pfp]]")
ASM_PREFETCH("[%[pfp], #64]")
ASM_PREFETCH("[%[pfp], #128]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
}
template <typename T>
-static inline void prefetch_2x(const T *pfp) {
- __asm __volatile (
+static inline void prefetch_2x(const T *pfp)
+{
+ __asm __volatile(
ASM_PREFETCH("[%[pfp]]")
ASM_PREFETCH("[%[pfp], #64]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
}
template <typename T>
-static inline void prefetch_1x(const T *pfp) {
- __asm __volatile (
+static inline void prefetch_1x(const T *pfp)
+{
+ __asm __volatile(
ASM_PREFETCH("[%[pfp]]")
- :
- : [pfp] "r" (pfp)
- : "memory"
- );
+ :
+ : [pfp] "r"(pfp)
+ : "memory");
}
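
For reference, a minimal sketch of how these prefetch helpers are typically used (not part of this patch; the loop and buffer are illustrative only, and asmlib.hpp is assumed to be included). The idea is to keep a few cache lines of lookahead in flight while streaming through data:

#include <cstddef>

// Illustrative caller (not from this patch): stream through a float buffer while
// keeping six 64-byte cache lines (384 bytes) of prefetch lookahead in flight.
static float stream_sum(const float *data, size_t len)
{
    float acc = 0.0f;
    for(size_t i = 0; i < len; i += 16)        // 16 floats == one 64-byte line
    {
        if(i + 96 < len)
        {
            prefetch_6x(data + i + 96);        // 96 floats == 384 bytes ahead
        }
        for(size_t j = i; j < i + 16 && j < len; j++)
        {
            acc += data[j];
        }
    }
    return acc;
}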
diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
new file mode 100644
index 0000000000..dd74744ebc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+
+#ifndef NO_MULTI_THREADING
+#include <atomic>
+#include <mutex>
+
+#define USE_SEMAPHORE
+
+#ifdef USE_SEMAPHORE
+#include <condition_variable>
+#endif
+
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+enum class BufferStatus
+{
+ IDLE,
+ POPULATING,
+ BUSY
+};
+
+class Buffer
+{
+private:
+ const int _maxusers; // Maximum permissible threads.
+ void *const _storage; // Storage for buffer content.
+
+ int _numusers; // Actual number of threads (might be lower).
+
+ volatile BufferStatus _status = BufferStatus::IDLE; // Status
+ std::atomic_int _users = {}; // How many users are still using the buffer.
+ volatile int _index = 0; // Which block of data currently resides in the buffer.
+
+ std::mutex _lock = {};
+#ifdef USE_SEMAPHORE
+ std::condition_variable _cv = {};
+#endif
+
+ template <typename T>
+ void populate_buffer(T func)
+ {
+ func(_storage);
+
+ /* Now mark it as ready. */
+#ifdef USE_SEMAPHORE
+ {
+ std::unique_lock<std::mutex> ul(_lock);
+ _status = BufferStatus::BUSY;
+ _cv.notify_all();
+ }
+#else
+ _status = BufferStatus::BUSY;
+#endif
+ }
+
+public:
+ Buffer(Buffer &) = delete;
+ Buffer &operator=(Buffer &) = delete;
+
+ Buffer(void *storage, int maxusers)
+ : _maxusers(maxusers), _storage(storage), _numusers(maxusers)
+ {
+ _status = BufferStatus::IDLE;
+ }
+
+ /* Try and populate the given index.
+ * Wait if the buffer is busy with previous index, then:
+ *
+ * If the buffer is idle, grab it and populate it.
+ * If it's already being populated by another thread or is ready, return.
+ */
+ template <typename T>
+ void try_populate(const int index, T func)
+ {
+ for(;;)
+ {
+#ifdef USE_SEMAPHORE
+ /* If it's busy with a previous index, wait on the semaphore. */
+ if((_status == BufferStatus::BUSY) && (_index != index))
+ {
+ std::unique_lock<std::mutex> ul(_lock);
+
+ if((_status == BufferStatus::BUSY) && (_index != index))
+ {
+ _cv.wait(ul);
+ }
+ }
+#endif
+ /* Return if another thread is populating it already. */
+ if((_index == index) && ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY)))
+ {
+ return;
+ }
+
+ if(_status == BufferStatus::IDLE)
+ {
+ std::lock_guard<std::mutex> guard(_lock);
+
+ /* If the buffer is still idle, we can grab it and populate it. */
+ if(_status == BufferStatus::IDLE)
+ {
+ _status = BufferStatus::POPULATING;
+ _index = index;
+ _users = _numusers;
+ break;
+ }
+ }
+ }
+
+ /* If we get here, fill in the buffer. */
+ populate_buffer(func);
+ }
+
+ template <typename T>
+ void *get(const int index, T func)
+ {
+ // Loop until we achieve something.
+ for(;;)
+ {
+ // If the index is correct and the buffer status is busy then we can
+ // just return the content. No locking is needed here as the index
+ // cannot change (and status cannot change from BUSY) until all
+ // users have finished.
+ if((_index == index) && (_status == BufferStatus::BUSY))
+ {
+ return _storage;
+ }
+#ifdef USE_SEMAPHORE
+ if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+ {
+ std::unique_lock<std::mutex> ul(_lock);
+
+ if(((_status == BufferStatus::BUSY) && (_index != index)) || (_status == BufferStatus::POPULATING))
+ {
+ _cv.wait(ul);
+ }
+ }
+#endif
+
+ // If it's idle, we need to populate it. The IDLE->POPULATING
+ // transition requires the lock.
+ if(_status == BufferStatus::IDLE)
+ {
+ std::lock_guard<std::mutex> guard(_lock);
+
+ /* If it's still idle, grab it. Otherwise drop through and
+ * we'll do something else next time through the loop. */
+ if(_status == BufferStatus::IDLE)
+ {
+ _status = BufferStatus::POPULATING;
+ _index = index;
+ _users = _numusers;
+ break;
+ }
+ }
+ }
+
+ /* If we get here we need to populate the buffer. */
+ populate_buffer(func);
+
+ return _storage;
+ }
+
+ /* Threads call this when they have finished processing a buffer. We
+ * simply (atomically) decrement the user count, and if it's hit zero we
+ * flag the buffer as idle.
+ */
+ void release(void)
+ {
+ if(--_users == 0)
+ {
+#ifdef USE_SEMAPHORE
+ std::unique_lock<std::mutex> ul(_lock);
+ _status = BufferStatus::IDLE;
+ /* We notify all waiters as we expect one to do the populating
+ * and any others to go and process an earlier block. */
+ _cv.notify_all();
+#else
+ _status = BufferStatus::IDLE;
+#endif
+ }
+ }
+
+ /* This is called to change the number of users. */
+ void set_numusers(int numusers)
+ {
+ _numusers = std::min(numusers, _maxusers);
+ }
+};
+
+class BufferManager
+{
+private:
+ /* This has to be a vector of Buffer *, because a Buffer cannot be moved
+ * or copied due to atomic members. */
+ std::vector<Buffer *> _buffers = {};
+ const int _maxthreads;
+ void *const _storage;
+
+public:
+ BufferManager(BufferManager &) = delete;
+ BufferManager &operator=(BufferManager &) = delete;
+
+ // Say how much storage is needed.
+ static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+ {
+ return buffersize * ((maxthreads == 1) ? 1 : 3);
+ }
+
+ BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+ : _maxthreads(maxthreads), _storage(storage)
+ {
+ const int numbuffers = (maxthreads == 1) ? 1 : 3;
+
+ /* We don't need any Buffer objects in single thread mode. */
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ /* Use intptr_t to avoid performing arithmetic on a void * */
+ intptr_t storage_int = reinterpret_cast<intptr_t>(_storage);
+
+ for(int i = 0; i < numbuffers; i++)
+ {
+ _buffers.push_back(new Buffer(reinterpret_cast<void *>(storage_int), _maxthreads));
+ storage_int += buffersize;
+ }
+ }
+
+ ~BufferManager()
+ {
+ while(_buffers.size())
+ {
+ delete _buffers.back();
+ _buffers.pop_back();
+ }
+ }
+
+ template <typename T>
+ void *get(const int index, T func)
+ {
+ /* In single thread mode, we just directly call the populating
+ * function on the (single) buffer, otherwise forward to the
+ * relevant Buffer. */
+ if(_maxthreads == 1)
+ {
+ func(_storage);
+ return _storage;
+ }
+ else
+ {
+ return _buffers[index % _buffers.size()]->get(index, func);
+ }
+ }
+
+ template <typename T>
+ void try_populate(const int index, T func)
+ {
+ /* No need for this in single thread mode. */
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ _buffers[index % _buffers.size()]->try_populate(index, func);
+ }
+
+ void release(const int index)
+ {
+ /* No need for this in single thread mode. */
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ _buffers[index % _buffers.size()]->release();
+ }
+
+ void set_nthreads(int threads)
+ {
+ if(_maxthreads == 1)
+ {
+ return;
+ }
+
+ for(unsigned int i = 0; i < _buffers.size(); i++)
+ {
+ _buffers[i]->set_numusers(threads);
+ }
+ }
+};
+
+#else
+
+/* Trivial implementation if threading is disabled at compile time.
+ *
+ * Here, we only need storage for a single buffer. The 'get' method needs
+ * to call the supplied function to populate the buffer and then return it.
+ * All the other methods do nothing.
+ */
+
+class BufferManager
+{
+private:
+ void *const _storage;
+
+public:
+ BufferManager(BufferManager &) = delete;
+ BufferManager &operator=(BufferManager &) = delete;
+
+ BufferManager(const int maxthreads, const size_t buffersize, void *storage)
+ : _storage(storage)
+ {
+ }
+
+ ~BufferManager()
+ {
+ }
+
+ // Say how much storage is needed.
+ static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize)
+ {
+ return buffersize;
+ }
+
+ template <typename T>
+ void try_populate(const int index, T func)
+ {
+ }
+
+ void release(const int index)
+ {
+ }
+
+ template <typename T>
+ void *get(const int index, T func)
+ {
+ func(_storage);
+ return _storage;
+ }
+
+ void set_nthreads(int)
+ {
+ }
+};
+
+#endif
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
new file mode 100644
index 0000000000..b9729d4c5c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+
+#include "arm_gemm.hpp"
+
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_hgemm_24x8.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<__fp16, __fp16> gemm(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const bool trA, const bool trB, const __fp16 alpha, const __fp16 beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+#ifdef __aarch64__
+ /* If FP16 is supported, use it */
+ if(ci.has_fp16())
+ {
+ return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+ }
+
+ /* Fallback to using the blocked SGEMM kernel. */
+ return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+ /* For AArch32, only support the SGEMM route. */
+ return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<sgemm_8x6, __fp16, __fp16>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class members
+#ifdef __aarch64__
+const int hgemm_24x8::out_width;
+const int hgemm_24x8::out_height;
+#endif
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
new file mode 100644
index 0000000000..1baa21fd1b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+#include "gemm_native.hpp"
+#include "gemv_native_transposed.hpp"
+#include "gemv_pretransposed.hpp"
+
+#include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/a64_sgemm_12x8.hpp"
+#include "kernels/a64_sgemm_native_16x4.hpp"
+#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_sgemv_trans.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<float, float> gemm<float, float>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const bool trA, const bool trB, const float alpha, const float beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+#ifdef __aarch64__
+ /* Cases in priority order */
+ /* GemvPretransposed: requires M=1, alpha=1, and transposed hint set */
+ if(M == 1 && alpha == 1.0f && pretransposed_hint)
+ {
+ return UniqueGemmCommon<float, float>(new GemvPretransposed<sgemv_pretransposed, float, float>(&ci, N, K, trB, beta));
+ }
+
+ /* GemvNativeTransposed: requires M=1, no trA or trB, doesn't handle beta */
+ if(M == 1 && beta == 1.0f && !trA && !trB)
+ {
+ return UniqueGemmCommon<float, float>(new GemvNativeTransposed<sgemv_trans, float, float>(&ci, N, K, alpha));
+ }
+
+ /* Native GEMM: requires M to be a multiple of 4, K a multiple of 4, N a
+ * multiple of 16, doesn't handle alpha and only makes sense for small
+ * sizes. */
+ if(N <= 128 && K <= 128 && ((M % 4) == 0) && ((K % 4) == 0) && ((N % 16) == 0) && alpha == 1.0f)
+ {
+ return UniqueGemmCommon<float, float>(new GemmNative<sgemm_native_16x4, float, float>(&ci, M, N, K, beta));
+ }
+
+ /* Blocked GEMM, handles all cases. */
+ return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_12x8, float, float>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#else
+ return UniqueGemmCommon<float, float>(new GemmInterleaved<sgemm_8x6, float, float>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+#endif
+}
+
+// Instantiate static class variables.
+#ifdef __aarch64__
+const int sgemm_12x8::out_width;
+const int sgemm_12x8::out_height;
+
+const int sgemm_native_16x4::out_width;
+const int sgemm_native_16x4::out_height;
+#else
+const int sgemm_8x6::out_width;
+const int sgemm_8x6::out_height;
+#endif
+
+} // namespace arm_gemm
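
For context, a hedged caller-side sketch of the gemm&lt;float, float&gt;() factory defined above (not part of this patch). It assumes the GemmCommon base class exposes a set_arrays()-style setter binding the A/B/C pointers and leading dimensions (as the interleaved implementation's use of _Aptr/_Bptr/_Cptr suggests), and that the CPUInfo type is as declared in arm_gemm.hpp; everything runs on a single thread:

#include <vector>

void run_sgemm(const arm_gemm::CPUInfo &ci,
               const float *A, int lda, const float *B, int ldb, float *C, int ldc,
               unsigned int M, unsigned int N, unsigned int K)
{
    // trA/trB = false, alpha = 1, beta = 0, single thread, no pretranspose hint.
    auto gemm = arm_gemm::gemm<float, float>(ci, M, N, K, false, false, 1.0f, 0.0f, 1, false);

    gemm->set_arrays(A, lda, B, ldb, C, ldc); // assumed GemmCommon setter

    // One workspace covers the A/C panels (and B buffers when not pretransposed).
    std::vector<char> workspace(gemm->get_working_size());
    gemm->set_working_space(workspace.data());

    // Single-threaded: process the full window as thread 0.
    gemm->execute(0, gemm->get_window_size(), 0);
}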
diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index 75c4dbdaa4..344bfed12b 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -21,28 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__
-#define __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__
+#ifdef __aarch64__
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
-namespace arm_compute
-{
-class ITensor;
+#include "kernels/a64_gemm_s16_12x8.hpp"
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEHGEMMAArch64FP16Kernel : public NEGEMMAssemblyBaseKernel
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<int16_t, int32_t> gemm<int16_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
{
-public:
- const char *name() const override
- {
- return "NEHGEMMAArch64FP16Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
+ return UniqueGemmCommon<int16_t, int32_t>(new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_s16_12x8::out_width;
+const int gemm_s16_12x8::out_height;
+
+} // namespace arm_gemm
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
new file mode 100644
index 0000000000..856d407cfa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_s16_12x8.hpp"
+#include "kernels/a64_gemm_s8_12x8.hpp"
+#include "kernels/a64_gemm_s8_4x4.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<int8_t, int32_t> gemm<int8_t, int32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const bool trA, const bool trB, const int32_t alpha, const int32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+ if(ci.has_dotprod())
+ {
+ // Dot product supporting CPUs. This family has a special version for A55r1.
+ return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+ }
+
+ return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+
+ // TODO: There's a better approach for A53, but it doesn't work
+ // well on heterogeneous systems as the required data formats
+ // are different. Figure out how to enable this:
+ // gemm = new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(ci, M, N, K, trA, trB);
+}
+
+// Instantiate static class members
+const int gemm_s8_12x8::out_width;
+const int gemm_s8_12x8::out_height;
+const int gemm_s8_4x4::out_width;
+const int gemm_s8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // aarch64
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
new file mode 100644
index 0000000000..27e4e8d411
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -0,0 +1,535 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "buffer_manager.hpp"
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND 64
+#define ROUND_UP(x) ((((x) + ALLOC_ROUND - 1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm
+{
+template <typename strategy, typename To, typename Tr>
+class GemmInterleaved : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ /* const properties set by constructor */
+ const CPUInfo *const _ci;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const bool _trA;
+ const bool _trB;
+
+ const Tr _alpha;
+ const Tr _beta;
+
+ const unsigned int _maxthreads;
+ const bool _pretransposed;
+
+ /* Blocking info */
+ unsigned int _k_block = 0;
+ unsigned int _x_block = 0;
+ unsigned int _Mround = 0;
+
+ /* Working space, pretransposed buffer, buffer manager */
+ const Toi *_B_transposed = nullptr;
+ BufferManager *_bm = nullptr;
+ void *_working_space = nullptr;
+
+ /* We will need to walk through the blocks of B in a few contexts, so
+ * factor that out. */
+ class blockwalker
+ {
+ private:
+ /* Loop parameters, we only block up N and K so don't worry about M. */
+ const unsigned int _Nsize, _Ksize, _x_block, _k_block;
+
+ /* K and X parameters for current iteration. */
+ unsigned int _k0 = 0, _x0 = 0;
+
+ unsigned int _index = 0;
+ bool _done = false;
+ bool _newkblock = true;
+
+ public:
+ blockwalker(const unsigned int K, const unsigned int k_block, const unsigned int N, const unsigned int x_block)
+ : _Nsize(N), _Ksize(K), _x_block(x_block), _k_block(k_block)
+ {
+ }
+
+ unsigned int xmax()
+ {
+ return std::min(_x0 + _x_block, _Nsize);
+ }
+
+ unsigned int kmax()
+ {
+ return std::min(_k0 + _k_block, _Ksize);
+ }
+
+ /* Advance to the next block, return false at the end. */
+ bool advance(void)
+ {
+ if(_done)
+ {
+ return false;
+ }
+
+ _newkblock = false;
+ _x0 += _x_block;
+ if(_x0 >= _Nsize)
+ {
+ _x0 = 0;
+ _k0 += _k_block;
+ if(_k0 >= _Ksize)
+ {
+ _done = true;
+ return false;
+ }
+ _newkblock = true;
+ }
+ _index++;
+
+ return true;
+ }
+
+ unsigned int k0(void)
+ {
+ return _k0;
+ }
+ unsigned int x0(void)
+ {
+ return _x0;
+ }
+ unsigned int index(void)
+ {
+ return _index;
+ }
+ bool done(void)
+ {
+ return _done;
+ }
+ bool newkblock(void)
+ {
+ return _newkblock;
+ }
+ };
+
+ // A working size: One of these needed, regardless of thread count. Divided according to window.
+ size_t get_a_working_size() const
+ {
+ return ROUND_UP(sizeof(Toi) * _k_block * _Mround);
+ }
+
+ // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
+ size_t get_b_working_size() const
+ {
+ return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
+ }
+
+ // C working size: One needed per thread.
+ size_t get_c_working_size() const
+ {
+ return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height);
+ }
+
+ // Internal execute function.
+ // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
+ template <bool pretransposed>
+ void execute_internal(unsigned int start, unsigned int end, int threadid)
+ {
+ profiler prof;
+ strategy strat(_ci);
+
+ blockwalker current(_Ksize, _k_block, _Nsize, _x_block);
+ blockwalker next = current;
+
+ /* Compute the M values to operate on */
+ unsigned int m_0 = start * strategy::out_height;
+ unsigned int m_max = std::min(end * strategy::out_height, _Msize);
+
+ /* Make sure we've been set up correctly. */
+ if(pretransposed)
+ {
+ assert(_B_transposed);
+ }
+ else
+ {
+ assert(_bm);
+ }
+
+ assert(_working_space);
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+ // Private buffers. Treat working_space as an array of C buffers (one per thread) first, followed by the (window-divided) A buffer.
+ Toi *const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) + (m_0 * _k_block * sizeof(Toi)));
+ Tri *const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+
+ // Shared buffers - these come either from BufferManager or _B_transposed.
+ const Toi *b_panel;
+
+ if(pretransposed)
+ {
+ b_panel = _B_transposed;
+ }
+
+ //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
+
+ // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
+ int kern_k = 0;
+
+ for(; !current.done(); current.advance())
+ {
+ if(current.newkblock())
+ {
+ prof(PROFILE_PREPA, ((m_max - m_0) * (current.kmax() - current.k0()) * sizeof(Toi)), [&](void)
+ {
+ if(_trA ^ strategy::A_transpose)
+ {
+ Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, this->_Aptr, this->_lda, m_0, m_max, current.k0(), current.kmax());
+ }
+ else
+ {
+ Transform<strategy::A_interleave, strategy::A_block, false>(a_panel, this->_Aptr, this->_lda, m_0, m_max, current.k0(), current.kmax());
+ }
+ });
+
+ // Figure out how many "K" the kernel will actually process.
+ kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll);
+ kern_k *= strat.k_unroll;
+ }
+
+ int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width);
+
+ if(!pretransposed)
+ {
+ /* Look ahead to the next block and populate it if necessary.
+ * This avoids the populate operation becoming a bottleneck, and
+ * helps keep the threads synchronized (the first thread to get
+ * here will populate while the rest will advance).
+ *
+ * If we are running single threaded, bm->try_populate() will do
+ * nothing.
+ */
+ if(next.advance())
+ {
+ _bm->try_populate(next.index(), [&](void *buffer)
+ {
+ prof(PROFILE_PREPB, (next.xmax() - next.x0()) * (next.kmax() - next.k0()) * sizeof(Toi), [&](void)
+ {
+ Toi *b_panel = reinterpret_cast<Toi *>(buffer);
+ if(_trB ^ strategy::B_transpose)
+ {
+ Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, this->_Bptr, this->_ldb, next.x0(), next.xmax(), next.k0(), next.kmax());
+ }
+ else
+ {
+ Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, this->_Bptr, this->_ldb, next.x0(), next.xmax(), next.k0(), next.kmax());
+ }
+ });
+ });
+ }
+
+ /* Get the buffer for this iteration from the BufferManager. */
+ b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv)
+ {
+ prof(PROFILE_PREPB, (current.xmax() - current.x0()) * (current.kmax() - current.k0()) * sizeof(Toi), [&](void)
+ {
+ Toi *b_panel = reinterpret_cast<Toi *>(bpv);
+ if(_trB ^ strategy::B_transpose)
+ {
+ Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, this->_Bptr, this->_ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+ else
+ {
+ Transform<strategy::B_interleave, strategy::B_block, false>(b_panel, this->_Bptr, this->_ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+ });
+ }));
+ }
+
+ /* Do the actual work. */
+ for(unsigned int y = m_0; y < m_max; y += strategy::out_height)
+ {
+ unsigned int ymax = std::min(_Msize, y + strategy::out_height);
+
+ prof(PROFILE_KERNEL, (strategy::out_height * bblocks * strategy::out_width * kern_k), [&](void)
+ {
+ strat.kernel(a_panel + ((y - m_0) * kern_k), b_panel, c_panel, 1, bblocks, kern_k);
+ });
+ prof(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr)), [&](void)
+ {
+ MergeResults<strategy::out_width, strategy::out_height>(this->_Cptr, c_panel, this->_ldc, y, ymax,
+ current.x0(), current.xmax(), _alpha, (current.k0() == 0 ? _beta : static_cast<Tr>(1)));
+ });
+ }
+
+ if(pretransposed)
+ {
+ b_panel += (bblocks * strat.out_width * kern_k);
+ }
+ else
+ {
+ _bm->release(current.index());
+ }
+ }
+ }
+
+public:
+ GemmInterleaved(GemmInterleaved &) = delete;
+ GemmInterleaved &operator=(GemmInterleaved &) = delete;
+
+ /* Constructor */
+ GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const bool trA, const bool trB, const Tr alpha, const Tr beta, const int maxthreads,
+ const bool pretransposed)
+ : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), _pretransposed(pretransposed)
+ {
+ const unsigned int L1_size = ci->get_L1_cache_size();
+ const unsigned int L2_size = ci->get_L2_cache_size();
+
+ assert(maxthreads > 0);
+
+ // Work out blocking parameters
+
+ // k_block: Find out how much of the larger array can be loaded into half the cache.
+ // This should account for associative caches.
+ _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width, strategy::out_height)));
+
+ // Needs to be (at least a single) multiple of the K unroll level.
+ _k_block /= strategy::k_unroll;
+ _k_block = std::max(_k_block, 1U) * strategy::k_unroll;
+
+ // Now tune to presented problem size; this is how many blocks we need.
+ int num_k_blocks = iceildiv(K, _k_block);
+
+ // So divide the space equally into that many blocks.
+ _k_block = iceildiv(K, num_k_blocks);
+
+ // And round UP to the K unroll level required.
+ _k_block = iceildiv(_k_block, strategy::k_unroll);
+ _k_block *= strategy::k_unroll;
+
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / (sizeof(Toi) * _k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ _x_block /= strategy::out_width;
+ _x_block = std::max(_x_block, 1U) * strategy::out_width;
+
+ // And tune to the presented problem size.
+ int num_x_blocks = iceildiv(N, _x_block);
+ _x_block = iceildiv(N, num_x_blocks);
+
+ _x_block = iceildiv(_x_block, strategy::out_width);
+ _x_block *= strategy::out_width;
+
+ // Work out the rounded size of M - needed for some buffers.
+ _Mround = iceildiv(M, strategy::out_height);
+ _Mround *= strategy::out_height;
+ }
+
+ // Interface implementation - Compulsory functions
+
+ // Window size: Only the last thread should do a ragged block, so dole out work in units of out_height.
+ unsigned int get_window_size() const override
+ {
+ // _Mround is a multiple of out_height by definition.
+ return _Mround / strategy::out_height;
+ }
+
+ // set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
+ void set_nthreads(int nthreads) override
+ {
+ if(_bm)
+ {
+ _bm->set_nthreads(nthreads);
+ }
+ }
+
+ // Execute
+ void execute(unsigned int start, unsigned int end, int threadid) override
+ {
+ if(_pretransposed)
+ {
+ execute_internal<true>(start, end, threadid);
+ }
+ else
+ {
+ execute_internal<false>(start, end, threadid);
+ }
+ }
+
+ // Interface implementation - working space
+ size_t get_working_size() const override
+ {
+ // In all cases, we need one A buffer plus a C buffer per thread.
+ size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
+
+ // For pretransposed case, there is no working space needed for B.
+ // Otherwise, we need a BufferManager.
+ if(!_pretransposed)
+ {
+ size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+ }
+
+ size += 64; // Add on a cache line extra for alignment.
+
+ return size;
+ }
+
+ void set_working_space(void *working_space) override
+ {
+ // Make sure everything ends up cache line aligned
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
+ intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
+
+ size_t diff = 0;
+
+ if(working_space_int & 0x3F)
+ {
+ diff = 0x40 - (working_space_int & 0x3F);
+ }
+
+ working_space_bytes += diff;
+
+ if(_pretransposed)
+ {
+ // Pretransposed case: just set internal pointer to parameter value.
+ _working_space = reinterpret_cast<void *>(working_space_bytes);
+ }
+ else
+ {
+ // Otherwise, use the first part of the working space for the buffer manager.
+ // It's legal to call this again so don't leak a buffer manager if it already existed.
+ delete _bm;
+
+ _bm = new BufferManager(_maxthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
+
+ working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
+
+ _working_space = reinterpret_cast<void *>(working_space_bytes);
+ }
+ }
+
+ // Interface implementation - pretransposed
+ bool B_is_pretransposed() const override
+ {
+ return _pretransposed;
+ }
+
+ bool B_pretranspose_required() const override
+ {
+ return _pretransposed && (_B_transposed == nullptr);
+ }
+
+ // TODO: this could almost certainly be considerably simpler.
+ size_t get_B_pretransposed_array_size() const override
+ {
+ size_t total = 0;
+ blockwalker current(_Ksize, _k_block, _Nsize, _x_block);
+
+ do
+ {
+ /* Figure out the size of each block. */
+ size_t x_size = (current.xmax() - current.x0());
+ size_t k_size = (current.kmax() - current.k0());
+
+ /* Round sizes up as needed. */
+ x_size = iceildiv(x_size, strategy::out_width);
+ x_size *= strategy::out_width;
+
+ k_size = iceildiv(k_size, strategy::k_unroll);
+ k_size *= strategy::k_unroll;
+
+ total += x_size * k_size * sizeof(Toi);
+ }
+ while(current.advance());
+
+ return total;
+ }
+
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb) override
+ {
+ blockwalker current(_Ksize, _k_block, _Nsize, _x_block);
+ Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
+ _B_transposed = buffer;
+
+ do
+ {
+ /* Figure out the size of each block. */
+ size_t x_size = (current.xmax() - current.x0());
+ size_t k_size = (current.kmax() - current.k0());
+
+ /* Round sizes up as needed. */
+ x_size = iceildiv(x_size, strategy::out_width);
+ x_size *= strategy::out_width;
+
+ k_size = iceildiv(k_size, strategy::k_unroll);
+ k_size *= strategy::k_unroll;
+
+ if(_trB ^ strategy::B_transpose)
+ {
+ Transform<strategy::B_interleave, strategy::B_block, true>(buffer, B, ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+ else
+ {
+ Transform<strategy::B_interleave, strategy::B_block, false>(buffer, B, ldb, current.x0(), current.xmax(), current.k0(), current.kmax());
+ }
+
+ buffer += (x_size * k_size);
+ }
+ while(current.advance());
+ }
+
+ ~GemmInterleaved() override
+ {
+ delete _bm;
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
new file mode 100644
index 0000000000..b0192793b9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for native GEMM with no transposition.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
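+//
+// A minimal usage sketch (illustrative only: "native_strategy" is a
+// placeholder for a suitable strategy class, and it is assumed that the
+// GemmCommon base class provides a set_arrays()-style call for the A/B/C
+// pointers and strides):
+//
+//     GemmNative<native_strategy, float, float> gemm(&ci, M, N, K, beta);
+//     gemm.set_arrays(A, lda, B, ldb, C, ldc);
+//     gemm.execute(0, gemm.get_window_size(), 0);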
+
+template <typename strategy, typename To, typename Tr>
+class GemmNative : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ Tr _beta;
+
+ const CPUInfo *const _ci;
+
+ unsigned int k_block = 0;
+ unsigned int n_block = 0;
+
+public:
+ GemmNative(GemmNative &) = delete;
+ GemmNative &operator=(GemmNative &) = delete;
+
+ GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const Tr beta)
+ : _Msize(M), _Nsize(N), _Ksize(K), _beta(beta), _ci(ci)
+ {
+ /* For now don't do any blocking. TODO: figure out if we should. */
+ k_block = K;
+ n_block = N;
+ }
+
+ // Window is number of out_height blocks
+ unsigned int get_window_size() const override
+ {
+ return iceildiv(_Msize, strategy::out_height);
+ }
+
+ // Actually execute the GEMM.
+ void execute(unsigned int start, unsigned int end, int) override
+ {
+ profiler prof;
+ strategy strat(_ci);
+
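+ // start/end are window indices in units of out_height rows; convert them
+ // to a row range and clamp it to the real M.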
+ unsigned int M_start = start * strategy::out_height;
+ unsigned int M_end = std::min(end * strategy::out_height, _Msize);
+
+ static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+ static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
+
+ for(unsigned int y0 = M_start; y0 < M_end; y0 += strategy::out_height)
+ {
+ unsigned int ymax = std::min(y0 + strategy::out_height, M_end);
+
+ prof(PROFILE_KERNEL, (ymax - y0) * _Nsize * _Ksize, [&](void)
+ {
+ strat.kernel(this->_Aptr + (y0 * this->_lda), this->_lda, this->_Bptr, this->_ldb, this->_Cptr + (y0 * this->_ldc), this->_ldc, _beta, (ymax - y0), _Nsize, _Ksize);
+ });
+ }
+ }
+};
+
+} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 9fb3ce415a..3e790e1b2a 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -21,28 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__
-#define __ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__
+#ifdef __aarch64__
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
-namespace arm_compute
-{
-class ITensor;
+#include "kernels/a64_gemm_u16_12x8.hpp"
-/** AArch64 NEON kernel to multiply an input vector "A" and a matrix "B". */
-class NEGEMVAArch64Kernel : public NEGEMMAssemblyBaseKernel
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint16_t, uint32_t> gemm<uint16_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const bool trA, const bool trB, uint32_t alpha, uint32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
{
-public:
- const char *name() const override
- {
- return "NEGEMVAArch64Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
+ return UniqueGemmCommon<uint16_t, uint32_t>(new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+}
+
+// Instantiate static class members
+const int gemm_u16_12x8::out_width;
+const int gemm_u16_12x8::out_height;
+
+} // namespace arm_gemm
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMVAARCH64KERNEL_H__*/
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
new file mode 100644
index 0000000000..9ec479ca7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "gemm_common.hpp"
+#include "gemm_interleaved.hpp"
+
+#include "kernels/a64_gemm_u8_12x8.hpp"
+#include "kernels/a64_gemm_u8_4x4.hpp"
+
+namespace arm_gemm
+{
+template <>
+UniqueGemmCommon<uint8_t, uint32_t> gemm<uint8_t, uint32_t>(const CPUInfo &ci, const unsigned int M, const unsigned int N, const unsigned int K,
+ const bool trA, const bool trB, const uint32_t alpha, const uint32_t beta,
+ const int maxthreads, const bool pretransposed_hint)
+{
+ if(ci.has_dotprod())
+ {
+ // Dot product supporting CPUs. This family has a special version for A55r1.
+ return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+ }
+
+ // Non dot-product code.
+ return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(&ci, M, N, K, trA, trB, alpha, beta, maxthreads, pretransposed_hint));
+
+ // TODO: There's a better approach for A53, but it doesn't work
+ // well on heterogeneous systems as the required data formats
+ // are different. Figure out how to enable this:
+ // gemm = new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(ci, M, N, K, trA, trB);
+}
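+
+// Typical use of this specialisation (illustrative only; the A/B/C pointers,
+// strides and any working space still have to be set on the returned object
+// before execution):
+//
+//     auto gemm = arm_gemm::gemm<uint8_t, uint32_t>(ci, M, N, K,
+//                                                   false, false, 1, 0,
+//                                                   nthreads, false);
+//     gemm->execute(0, gemm->get_window_size(), 0);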
+
+// Instantiate static class members
+const int gemm_u8_12x8::out_width;
+const int gemm_u8_12x8::out_height;
+
+const int gemm_u8_4x4::out_width;
+const int gemm_u8_4x4::out_height;
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
new file mode 100644
index 0000000000..c0b886266d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for a "native" (no-transform) GEMV with a
+// transposed matrix.
+//
+// As a native operation the source data is used in-place, so the internal
+// and external operand/result types must match.
+template <typename strategy, typename To, typename Tr>
+class GemvNativeTransposed : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const Tr _alpha;
+
+ const CPUInfo *const _ci;
+
+ unsigned int m_block = 0;
+ unsigned int n_block = 0;
+
+public:
+ GemvNativeTransposed(GemvNativeTransposed &) = delete;
+ GemvNativeTransposed &operator=(GemvNativeTransposed &) = delete;
+
+ GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const Tr alpha)
+ : _Nsize(N), _Ksize(K), _alpha(alpha), _ci(ci)
+ {
+ /* For now don't do any blocking. TODO: figure out if we should. */
+ m_block = K;
+ n_block = N;
+ }
+
+ // Window is number of out_width blocks.
+ unsigned int get_window_size() const override
+ {
+ return iceildiv(_Nsize, strategy::out_width);
+ }
+
+ // Actually execute the GEMV.
+ void execute(unsigned int start, unsigned int end, int) override
+ {
+ profiler prof;
+ strategy strat(_ci);
+
+ unsigned int N_start = start * strategy::out_width;
+ unsigned int N_end = std::min(end * strategy::out_width, _Nsize);
+
+ static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
+ static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
+
+ for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+ {
+ unsigned int mmax = std::min(m0 + m_block, _Ksize);
+
+ for(unsigned int n0 = N_start; n0 < N_end; n0 += n_block)
+ {
+ unsigned int nmax = std::min(n0 + n_block, N_end);
+
+ prof(PROFILE_KERNEL, ((mmax - m0) * (nmax - n0)), [&](void)
+ {
+ strat.kernel(this->_Bptr + (m0 * this->_ldb) + n0, this->_Aptr + m0, this->_Cptr + n0,
+ _alpha, this->_ldb, (mmax - m0), (nmax - n0));
+ });
+ }
+ }
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
new file mode 100644
index 0000000000..0df331acb4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <stdio.h>
+
+#include "arm_gemm.hpp"
+
+#include "mergeresults.hpp"
+#include "profiler.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm
+{
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation is for GEMV with a pretransposed matrix.
+//
+// By default the source data is used in-place, but if type conversion is
+// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
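+//
+// The pretransposed-B interface below is used as follows: if
+// B_pretranspose_required() returns true, the caller allocates
+// get_B_pretransposed_array_size() bytes and hands them to
+// pretranspose_B_array() together with the original B; execute() then only
+// reads the rearranged copy.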
+
+template <typename strategy, typename To, typename Tr>
+class GemvPretransposed : public GemmCommon<To, Tr>
+{
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const bool _trB;
+
+ const Tr _beta;
+
+ const CPUInfo *const _ci;
+
+ unsigned int m_block = 0;
+ unsigned int n_block = 0;
+
+ const Toi *_A_pretransposed = nullptr;
+
+public:
+ GemvPretransposed(GemvPretransposed &) = delete;
+ GemvPretransposed &operator=(GemvPretransposed &) = delete;
+
+ GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const bool trB, const Tr beta)
+ : _Nsize(N), _Ksize(K), _trB(trB), _beta(beta), _ci(ci)
+ {
+ /* For now don't do any blocking. TODO: figure out if we should. */
+ m_block = K;
+ n_block = N;
+ }
+
+ // Window is number of out_width blocks.
+ unsigned int get_window_size() const override
+ {
+ return iceildiv(_Nsize, strategy::out_width);
+ }
+
+ // Actually execute the GEMV.
+ void execute(unsigned int start, unsigned int end, int) override
+ {
+ profiler prof;
+ strategy strat(_ci);
+
+ unsigned int N_start = start * strategy::out_width;
+ unsigned int N_end = std::min(end * strategy::out_width, _Nsize);
+
+ static_assert(std::is_same<Tr, Tri>::value, "GemvPretransposed: Result types must be the same.");
+
+ for(unsigned int m0 = 0; m0 < _Ksize; m0 += m_block)
+ {
+ unsigned int mmax = std::min(m0 + m_block, _Ksize);
+
+ for(unsigned int n0 = N_start; n0 < N_end; n0 += n_block)
+ {
+ unsigned int nmax = std::min(n0 + n_block, N_end);
+
+ prof(PROFILE_KERNEL, ((mmax - m0) * (nmax - n0)), [&](void)
+ {
+ /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
+ strat.kernel(_A_pretransposed + (n0 * _Ksize) + (m0 * strategy::A_interleave), (_Ksize * strategy::A_interleave), this->_Aptr + m0, this->_Cptr + n0, _beta, (mmax - m0), (nmax - n0));
+ });
+ }
+ }
+ }
+
+ /* Pretransposed interface implementation */
+ bool B_is_pretransposed() const override
+ {
+ return true;
+ }
+
+ bool B_pretranspose_required() const override
+ {
+ /* Transpose is required if _A_pretransposed is still nullptr */
+ return (_A_pretransposed == nullptr);
+ }
+
+ size_t get_B_pretransposed_array_size() const override
+ {
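+ // Note: the buffer is sized in floats, i.e. this assumes the operand
+ // type Toi is float.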
+ return _Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave * sizeof(float);
+ }
+
+ void pretranspose_B_array(void *buffer, const To *B, const int ldb) override
+ {
+ Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
+
+ /* Reverse sense here as we are dealing with B rather than A. So if
+ * strategy::A_transpose is false and _trB is false, we still
+ * transpose. */
+ if(_trB ^ strategy::A_transpose)
+ {
+ Transform<strategy::A_interleave, strategy::A_block, false>(A_buffer, B, ldb, 0, _Nsize, 0, _Ksize);
+ }
+ else
+ {
+ Transform<strategy::A_interleave, strategy::A_block, true>(A_buffer, B, ldb, 0, _Nsize, 0, _Ksize);
+ }
+
+ _A_pretransposed = A_buffer;
+ }
+};
+
+} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
index d78d33c647..de11dc582c 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,12 @@
#ifdef __arm__
+namespace arm_gemm
+{
// Actual kernel implementations
-#include "a32_sgemm_8x6/a53.hpp"
-#include "a32_sgemm_8x6/a55r1.hpp"
-#include "a32_sgemm_8x6/generic.hpp"
+void a32_sgemm_8x6(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a53(const float *, const float *, float *, int, int, int);
+void a32_sgemm_8x6_a55r1(const float *, const float *, float *, int, int, int);
// 8x6 SGEMM "strategy" class.
//
@@ -38,7 +40,8 @@
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class sgemm_8x6 {
+class sgemm_8x6
+{
public:
typedef float operand_type;
typedef float result_type;
@@ -47,28 +50,30 @@ public:
/* Describes the data layout for A input */
static const int A_interleave = 6;
- static const int A_block = 1;
- static const int A_transpose = 0;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
/* Same for B input */
static const int B_interleave = 8;
- static const int B_block = 1;
- static const int B_transpose = 1;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
/* Kernel blocking parameters */
- static const int out_width = 8;
+ static const int out_width = 8;
static const int out_height = 6;
- static const int k_unroll = 1;
+ static const int k_unroll = 1;
- kern_type kernel = nullptr;
+ kern_type kernel = a32_sgemm_8x6;
- sgemm_8x6(const CPUInfo *ci) {
- switch(ci->CPU) {
- case CPUTarget::A53:
+ sgemm_8x6(const CPUInfo *ci)
+ {
+ switch(ci->get_cpu_model())
+ {
+ case CPUModel::A53:
kernel = a32_sgemm_8x6_a53;
break;
- case CPUTarget::A55_DOT:
+ case CPUModel::A55r1:
kernel = a32_sgemm_8x6_a55r1;
break;
@@ -79,4 +84,5 @@ public:
}
};
+} // namespace arm_gemm
#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
new file mode 100644
index 0000000000..428498f79e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
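+//
+// Concretely, each A block supplies 6*K floats and each B block 8*K floats,
+// and every (A block, B block) pair produces one 8x6 block of C (48 floats,
+// stored contiguously).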
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int tails = (K & 3);
+ if(tails == 0)
+ {
+ tails = 4;
+ }
+ int k = ((K + 3) / 4) - 1;
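+ // For example, K=10 gives k=2 and tails=2: two full 4-step unrolled
+ // iterations of the main loop plus a 2-step tail.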
+
+ __asm __volatile(
+ "vmov.i32 q4, #0\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]\n"
+ "vmov.i32 q5, #0\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]\n"
+ "vmov.i32 q6, #0\n"
+ "ldr r0, [%[a_ptr], #0x10]\n"
+ "vmov.i32 q7, #0\n"
+ "ldr r1, [%[a_ptr], #0x14]\n"
+ "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32 q9, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32 q10, #0\n" ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32 q11, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x80]")
+ "vmov.i32 q12, #0\n"
+ "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32 q14, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]")
+ "vmov.i32 q15, #0\n"
+ "cmp %[k], #0\n"
+ "beq 6f\n"
+
+ "1:\n"
+ // Unroll 0
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmov d2, r0, r1\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "ldr r0, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "ldr r1, [%[b_ptr], #0x1C]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x100]")
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x20]\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "ldr r0, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "ldr r1, [%[b_ptr], #0x2C]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "ldr r0, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "ldr r1, [%[a_ptr], #0x2C]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+
+ // Unroll 1
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmov d1, r0, r1\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "ldr r0, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "ldr r1, [%[b_ptr], #0x3C]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x100]")
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "ldr r0, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "ldr r1, [%[b_ptr], #0x4C]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "ldr r0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "ldr r1, [%[a_ptr], #0x44]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+
+ // Unroll 2
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmov d0, r0, r1\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "ldr r0, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "ldr r1, [%[b_ptr], #0x5C]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "ldr r0, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "ldr r1, [%[b_ptr], #0x6C]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "ldr r0, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "ldr r1, [%[a_ptr], #0x5C]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+
+ // Unroll 3
+ "vldr d6, [%[b_ptr], #0x70]\n"
+ "vmov d3, r0, r1\n"
+ "vmla.f32 q4, q2, d1[0]\n"
+ "ldr r0, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "ldr r1, [%[b_ptr], #0x7C]\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+
+ "vldr d0, [%[a_ptr], #0x00]\n"
+ "vmov d7, r0, r1\n"
+ "vmla.f32 q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0xC0]")
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+
+ "vldr d4, [%[b_ptr], #0x00]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "ldr r0, [%[b_ptr], #0x08]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "ldr r1, [%[b_ptr], #0x0C]\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "subs %[k], %[k], #1\n"
+
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmov d5, r0, r1\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "ldr r0, [%[a_ptr], #0x10]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "ldr r1, [%[a_ptr], #0x14]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ // "Tails" shows how many multiply blocks are needed at the
+ // end; it must be 1-4 inclusive. Bail out to the alternative
+ // tail immediately if it's 1.
+ "6:\n"
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
+
+ // Detached final iteration - for now adapt the generic
+ // tails rather than reimplementing for A53.
+
+ // Unroll 0
+ "vmov d2, r0, r1\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "add %[b_ptr], %[b_ptr], #0x10\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 4f\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 5f\n"
+
+ // Unroll 2
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==1 final tail
+ "3:\n"
+ "vmov d2, r0, r1\n"
+ "add %[b_ptr], %[b_ptr], #0x10\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==2 final tail
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==3 final tail
+ "5:\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vld1.32 {d0}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+
+ "2:\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
new file mode 100644
index 0000000000..4cfb72a455
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ /* Work out starting values for "k" and "tails" in the inner loop. */
+ int tails_initial = (K & 3);
+ if(tails_initial == 0)
+ {
+ tails_initial = 4;
+ }
+
+ int k_initial = ((K + 3) / 4) - 1;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ int tails = tails_initial;
+ int k = k_initial;
+
+ a_ptr = a_ptr0;
+
+ __asm __volatile(
+ "vldr d0, [%[a_ptr]]\n"
+ "vmov.i32 q4, #0\n"
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmov.i32 q5, #0\n"
+ "vldr d4, [%[b_ptr]]\n"
+ "vmov.i32 q6, #0\n"
+ "vldr d5, [%[b_ptr], #0x08]\n"
+ "vmov.i32 q7, #0\n"
+ "vldr d2, [%[a_ptr], #0x10]\n"
+ "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[b_ptr], #0x40]") "vmov.i32 q9, #0\n" ASM_PREFETCH("[%[a_ptr], #0x40]") "vmov.i32 q10, #0\n" ASM_PREFETCH("[%[b_ptr], #0x80]") "vmov.i32 q11, #0\n"
+ ASM_PREFETCH("[%[a_ptr], #0x80]") "vmov.i32 q12, #0\n" ASM_PREFETCH("[%[b_ptr], #0XC0]") "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #0xC0]") "vmov.i32 q14, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #0x100]") "vmov.i32 q15, #0\n" ASM_PREFETCH("[%[a_ptr], #0x100]") "cmp %[k], #0\n" ASM_PREFETCH("[%[b_ptr], #0x140]") "beq 6f\n"
+ ASM_PREFETCH("[%[b_ptr], #0x180]")
+
+ "1:\n"
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmla.f32 q7, q2, d1[1]\n" ASM_PREFETCH("[%[a_ptr], #0x140]")
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[k], %[k], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vldr d4, [%[b_ptr], #0x20]\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vldr d5, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+
+ "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #0x1C0]")
+ "vmla.f32 q8, q2, d1[0]\n"
+
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vldr d5, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+
+ // Unroll 2
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+
+ "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #0x180]")
+ "vmla.f32 q8, q2, d0[0]\n"
+
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vldr d5, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vldr d6, [%[b_ptr], #0x70]\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vldr d7, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vldr d0, [%[a_ptr], #0x00]\n"
+ "vmla.f32 q7, q2, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #0x180]")
+ "vmla.f32 q8, q2, d3[0]\n"
+
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vldr d4, [%[b_ptr], #0x00]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vldr d5, [%[b_ptr], #0x08]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x08]\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vldr d2, [%[a_ptr], #0x10]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ // "Tails" shows how many multiply blocks are needed at the
+ // end; it must be 1-4 inclusive. Bail out to the alternative
+ // tail immediately if it's 1.
+ "6:\n"
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
+
+ // Detached final iteration
+
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vldr d3, [%[a_ptr], #0x18]\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vldr d4, [%[b_ptr], #0x20]\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vldr d5, [%[b_ptr], #0x28]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x20]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "add %[b_ptr], %[b_ptr], #0x30\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vldr d1, [%[a_ptr], #0x28]\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "beq 4f\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vldr d2, [%[a_ptr], #0x30]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+
+ "vmla.f32 q9, q2, d1[1]\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vldr d4, [%[b_ptr], #0x40]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vldr d5, [%[b_ptr], #0x48]\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vldr d3, [%[a_ptr], #0x38]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vldr d0, [%[a_ptr], #0x40]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "beq 5f\n"
+
+ // Unroll 2
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vldr d1, [%[a_ptr], #0x48]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vldr d4, [%[b_ptr], #0x60]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vldr d5, [%[b_ptr], #0x68]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vldr d2, [%[a_ptr], #0x50]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vldr d3, [%[a_ptr], #0x58]\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vldr d6, [%[b_ptr], #0x70]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vldr d7, [%[b_ptr], #0x78]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x60\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x80\n"
+ "b 2f\n"
+
+ // tails==1 final tail
+ "3:\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vldr d6, [%[b_ptr], #0x10]\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vldr d7, [%[b_ptr], #0x18]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x18\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x20\n"
+ "b 2f\n"
+
+ // tails==2 final tail
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vldr d6, [%[b_ptr], #0x30]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vldr d7, [%[b_ptr], #0x38]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x40\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x30\n"
+ "b 2f\n"
+
+ // tails==3 final tail
+ "5:\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vldr d6, [%[b_ptr], #0x50]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vldr d7, [%[b_ptr], #0x58]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "add %[a_ptr], %[a_ptr], #0x48\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "add %[b_ptr], %[b_ptr], #0x60\n"
+
+ "2:\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0", "r1");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
new file mode 100644
index 0000000000..d7d0484610
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 6xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 8xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 8x6), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a32_sgemm_8x6(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int tails = (K & 3);
+ if(tails == 0)
+ {
+ tails = 4;
+ }
+ int k = ((K + 3) / 4) - 1;
+
+ __asm __volatile(
+ "vmov.i32 q4, #0\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmov.i32 q5, #0\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+ "vmov.i32 q6, #0\n" ASM_PREFETCH("[%[a_ptr], #48]") "vmov.i32 q7, #0\n" ASM_PREFETCH("[%[b_ptr], #48]") "vmov.i32 q8, #0\n" ASM_PREFETCH("[%[a_ptr], #112]") "vmov.i32 q9, #0\n"
+ ASM_PREFETCH("[%[b_ptr], #112]")
+ "vmov.i32 q10, #0\n"
+ "vmov.i32 q11, #0\n"
+ "vmov.i32 q12, #0\n"
+ "vmov.i32 q13, #0\n" ASM_PREFETCH("[%[a_ptr], #176]") "vmov.i32 q14, #0\n" ASM_PREFETCH("[%[b_ptr], #176]")
+ "vmov.i32 q15, #0\n"
+
+ "cmp %[k], #0\n"
+ "beq 6f\n"
+
+ "1:\n"
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "subs %[k], %[k], #1\n"
+ "vmla.f32 q5, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #208]")
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+
+ // Unroll 2
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n" ASM_PREFETCH("[%[a_ptr], #240]")
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n" ASM_PREFETCH("[%[b_ptr], #208]")
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "bne 1b\n"
+
+ // Branch here if we never execute the main loop.
+ "6:\n"
+
+ // "Tails" shows how many multiply blocks are needed at the
+ // end; it must be 1-4 inclusive. Bail out to the alternative
+ // tail immediately if it's 1.
+ "subs %[tails], %[tails], #1\n"
+ "beq 3f\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 4f\n"
+
+ // Unroll 1
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "subs %[tails], %[tails], #1\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "beq 5f\n"
+
+ // Unroll 2
+ "vld1.32 {d0-d1}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vld1.32 {d4-d5}, [%[b_ptr] :128]!\n"
+
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vld1.32 {d2-d3}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+
+ // Unroll 3
+ "vmla.f32 q4, q2, d1[0]\n"
+ "vmla.f32 q10, q3, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d1[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d1[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d2[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d2[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d2[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d2[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d3[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d3[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d3[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d3[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==1 final tail
+ "3:\n"
+ "vmla.f32 q4, q2, d0[0]\n"
+ "vld1.32 {d2}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d0[1]\n"
+ "vld1.32 {d6-d7}, [%[b_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d1[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d0[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d0[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d1[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d1[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d1[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d2[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d2[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d2[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d2[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==2 final tail
+ "4:\n"
+ "vmla.f32 q4, q2, d3[0]\n"
+ "vmla.f32 q10, q3, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q5, q2, d3[1]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d3[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q6, q2, d0[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d0[0]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d0[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d0[1]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d1[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d1[0]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d1[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d1[1]\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+ "b 2f\n"
+
+ // tails==3 final tail
+ "5:\n"
+ "vmla.f32 q4, q2, d2[0]\n"
+ "vld1.32 {d0}, [%[a_ptr] :64]!\n"
+ "vmla.f32 q5, q2, d2[1]\n"
+ "vmla.f32 q6, q2, d3[0]\n"
+ "vst1.32 {d8-d9}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q10, q3, d2[0]\n"
+ "vst1.32 {d20-d21}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q11, q3, d2[1]\n"
+ "vst1.32 {d10-d11}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q12, q3, d3[0]\n"
+ "vst1.32 {d22-d23}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q7, q2, d3[1]\n"
+ "vst1.32 {d12-d13}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q13, q3, d3[1]\n"
+ "vst1.32 {d24-d25}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q8, q2, d0[0]\n"
+ "vst1.32 {d14-d15}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q14, q3, d0[0]\n"
+ "vst1.32 {d26-d27}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q9, q2, d0[1]\n"
+ "vst1.32 {d16-d17}, [%[c_ptr] :128]!\n"
+ "vmla.f32 q15, q3, d0[1]\n"
+ "vst1.32 {d28-d29}, [%[c_ptr] :128]!\n"
+ "vst1.32 {d18-d19}, [%[c_ptr] :128]!\n"
+
+ "2:\n"
+ "vst1.32 {d30-d31}, [%[c_ptr] :128]!\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k), [tails] "+r"(tails)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
index f7659b9a67..387f899b20 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,8 +25,10 @@
#ifdef __aarch64__
+namespace arm_gemm
+{
// Actual kernel implementations
-#include "a64_gemm_s16_12x8/generic.hpp"
+void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
// 12x8 SGEMM "strategy" class.
//
@@ -36,7 +38,8 @@
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class gemm_s16_12x8 {
+class gemm_s16_12x8
+{
public:
typedef int16_t operand_type;
typedef int32_t result_type;
@@ -45,24 +48,26 @@ public:
/* Describes the data layout for A input */
static const int A_interleave = 8;
- static const int A_block = 1;
- static const int A_transpose = 0;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
/* Same for B input */
static const int B_interleave = 12;
- static const int B_block = 1;
- static const int B_transpose = 1;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
/* Kernel blocking parameters */
- static const int out_width = 12;
+ static const int out_width = 12;
static const int out_height = 8;
- static const int k_unroll = 1;
+ static const int k_unroll = 1;
- kern_type kernel = nullptr;
+ kern_type kernel = a64_gemm_s16_asimd_12x8;
- gemm_s16_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_s16_asimd_12x8;
+ gemm_s16_12x8(const CPUInfo *ci)
+ {
}
};
+} // namespace arm_gemm
+
#endif // __aarch64__
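
The "strategy" classes in these headers are descriptors only: the static members spell out the packed-panel layout and blocking, and the kernel member, chosen in the constructor from the CPUInfo, is the function the caller ultimately invokes on the interleaved A/B panels. A minimal, self-contained sketch of that calling pattern follows; every name in it is an illustrative stand-in rather than part of this patch.

    #include <cstdint>
    #include <cstdio>

    // Toy stand-ins so the sketch compiles on its own; the real types come
    // from the strategy headers in this patch.
    struct CPUInfo {};

    static void toy_kernel(const int16_t *, const int16_t *, int32_t *c, int, int, int)
    {
        c[0] = 42; // placeholder for the real 12x8 block computation
    }

    struct gemm_s16_12x8_like
    {
        typedef int16_t operand_type;
        typedef int32_t result_type;
        typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int);

        static const int out_width  = 12;
        static const int out_height = 8;
        static const int k_unroll   = 1;

        kern_type kernel = toy_kernel; // the real class selects this per CPU

        gemm_s16_12x8_like(const CPUInfo *)
        {
        }
    };

    int main()
    {
        CPUInfo ci;
        gemm_s16_12x8_like strat(&ci);

        int16_t a[8 * 4]  = {}; // one packed A panel (out_height rows, K=4)
        int16_t b[12 * 4] = {}; // one packed B panel (out_width columns, K=4)
        int32_t c[12 * 8] = {};

        // ablocks/bblocks count packed panels; the final argument is K.
        strat.kernel(a, b, c, 1, 1, 4);
        printf("%d\n", c[0]);
        return 0;
    }
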
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
new file mode 100644
index 0000000000..b217dcf2cf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const int16_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int16_t *a_ptr0 = a_ptr;
+ const int16_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ const bool odd_k = K & 0x1;
+ int k = (K + 1) / 2 - 1;
+
+ register int16x8_t aa asm("v0");
+ register int16x8_t ab asm("v1");
+ register int16x8_t b0 asm("v2");
+ register int16x8_t b1 asm("v3");
+ register int16x8_t b2 asm("v4");
+
+ __asm __volatile(
+ "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+ "movi v5.4s, #0\n"
+ "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+ "movi v6.4s, #0\n"
+ "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
+ "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v8.4s, #0\n"
+ "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+ "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v10.4s, #0\n"
+ "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
+ "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+ "movi v12.4s, #0\n"
+ "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v24.4s, #0\n"
+ "add %x[a_ptr], %x[a_ptr], #0x10\n"
+ "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v26.4s, #0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x18\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+
+ "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+ "1:\n" // Main loop
+ // First unroll
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ // Second unroll
+ "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
+ "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+ "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "add %x[a_ptr], %x[a_ptr], #0x20\n"
+ "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "subs %x[k], %x[k], #0x1\n"
+ "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
+ "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "add %x[b_ptr], %x[b_ptr], #0x30\n"
+ "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "bne 1b\n"
+
+ "2:\n" // Even tail
+ "cbnz %x[odd_k], 3f\n"
+
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "add %[a_ptr], %[a_ptr], #0x10\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "add %[b_ptr], %[b_ptr], #0x18\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "smlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "smlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "smlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "smlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "smlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "smlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "smlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "smlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+ "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "b 4f\n" // Complete write out
+
+ "3:\n" // Odd tail
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "smlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "smlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "smlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "smlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "smlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "smlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "smlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "smlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "smlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "smlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "smlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "smlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "smlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "smlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "smlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+
+ "4:\n" // End of function
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "str q27, [%x[c_ptr], #0x140]\n"
+ "str q12, [%x[c_ptr], #0x150]\n"
+ "str q20, [%x[c_ptr], #0x160]\n"
+ "str q28, [%x[c_ptr], #0x170]\n"
+ "add %x[c_ptr], %x[c_ptr], #0x180\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+ [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+ : [odd_k] "r"(odd_k)
+ : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
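
Throughout the kernel above, each 128-bit operand is assembled from two halves: a 64-bit "ldr" of the lower part, a scalar "ldr x20" of the upper part, and an "ins ... d[1]" to merge them (the "Load A[A].lower"/"upper"/"Merge" comments). The sketch below, offered purely as an illustration and not part of the patch, checks with intrinsics that the merged value matches a single 128-bit load of the same bytes.

    #include <arm_neon.h>
    #include <cassert>
    #include <cstdint>

    // Build a 128-bit vector from two 64-bit loads, mirroring the
    // "ldr %d / ldr x20 / ins .d[1]" sequence used in the assembly above.
    static int16x8_t load_split(const int16_t *p)
    {
        int64x1_t lo = vld1_s64(reinterpret_cast<const int64_t *>(p));     // lower half
        int64x1_t hi = vld1_s64(reinterpret_cast<const int64_t *>(p) + 1); // upper half
        return vreinterpretq_s16_s64(vcombine_s64(lo, hi));
    }

    int main()
    {
        alignas(16) int16_t buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        int16x8_t merged = load_split(buf);
        int16x8_t direct = vld1q_s16(buf); // one 128-bit load of the same bytes
        assert(vgetq_lane_s16(merged, 0) == vgetq_lane_s16(direct, 0));
        assert(vgetq_lane_s16(merged, 7) == vgetq_lane_s16(direct, 7));
        return 0;
    }
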
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
index 88cbb361b3..08f90e16ed 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,37 +25,48 @@
#ifdef __aarch64__
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
// Load the actual kernel
-#include "a64_gemm_s8_12x8/generic.hpp"
+void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class gemm_s8_12x8 {
+class gemm_s8_12x8
+{
public:
- typedef int8_t operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Describes the data layout for A input */
- static const int A_interleave = 8;
- static const int A_block = 4;
- static const bool A_transpose = false;
+ static const int A_interleave = 8;
+ static const int A_block = 4;
+ static const bool A_transpose = false;
/* Same for B input */
- static const int B_interleave = 12;
- static const int B_block = 4;
- static const bool B_transpose = true;
+ static const int B_interleave = 12;
+ static const int B_block = 4;
+ static const bool B_transpose = true;
/* Kernel blocking parameters */
- static const int out_width = 12;
+ static const int out_width = 12;
static const int out_height = 8;
- static const int k_unroll = 4;
+ static const int k_unroll = 4;
- kern_type kernel = nullptr;
+ kern_type kernel = a64_gemm_s8_12x8;
- gemm_s8_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_s8_12x8;
+ gemm_s8_12x8(const CPUInfo *ci)
+ {
+ if(ci->get_cpu_model() == CPUModel::A55r1)
+ {
+ kernel = a64_gemm_s8_12x8_a55r1;
+ }
}
};
-#endif // __aarch64__
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
new file mode 100644
index 0000000000..ef2f29183c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+
+ // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+ const int W = K / 4;
+
+    // Fix up for odd lengths - set a flag if the block count W is odd, but
+    // make sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int k_iters = ((W + 1) / 2) - 1;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = k_iters;
+
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_SDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+ // The loop is offset by these two instructions which must
+ // always be executed.
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "b.ne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a.
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "b 3f\n"
+
+ // Odd K continuation
+ "2:\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ASM_PREFETCHWL2("[%[c_ptr], #640]") "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem sdot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
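
The W / oddk / k_iters bookkeeping at the top of the kernel above (and of the generic variant further down) can be checked in isolation: each pass of the main loop consumes two 4-element blocks, and the detached final iteration consumes the remaining one (odd W) or two (even W). Below is a minimal standalone sketch of that arithmetic; the helper name is an assumption and not part of the patch.

    #include <cassert>

    // Blocks of 4 int8 values consumed for a given K, following the same
    // arithmetic as the kernel: k_iters passes of the main loop (two blocks
    // each) plus the detached final iteration (one or two blocks).
    static int blocks_consumed(int K)
    {
        const int W       = K / 4; // sdot handles 4 values per step
        const int oddk    = (W & 1);
        const int k_iters = ((W + 1) / 2) - 1;
        return 2 * k_iters + (oddk ? 1 : 2);
    }

    int main()
    {
        assert(blocks_consumed(4)  == 1); // W=1: loop skipped, odd tail only
        assert(blocks_consumed(8)  == 2); // W=2: loop skipped, even tail
        assert(blocks_consumed(20) == 5); // W=5: two loop passes + odd tail
        assert(blocks_consumed(32) == 8); // W=8: three loop passes + even tail
        return 0;
    }
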
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000000..c76f99d776
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the SDOT instruction (in the absence of toolchain support)
+#define _DECLARE_SDOT \
+ ".altmacro\n" \
+ ".macro sdot opd:req, opn:req, opm:req\n" \
+ "local vd, vn, vm, h, l\n" \
+ ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+ ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \
+ ".set vd,\\reg\n" \
+ ".endif\n" \
+ ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \
+ ".set vn,\\reg\n" \
+ ".endif\n" \
+ ".irp idx,0,1,2,3\n" \
+ ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \
+ ".set vm,\\reg\n" \
+ ".set h,\\idx / 2\n" \
+ ".set l,\\idx %% 2\n" \
+ ".endif\n" \
+ ".endr\n" \
+ ".endr\n" \
+ ".ifndef vd\n" \
+ ".error \"Bad operand \\opd\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vn\n" \
+ ".error \"Bad operand \\opn\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vm\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef h\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef l\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \
+ ".endm\n"
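
The ".int" word the macro emits is plain integer arithmetic over the operand numbers, so the encoding can be reproduced outside the assembler. A minimal standalone sketch (not part of the patch) printing the word for "sdot v8.4s, v2.16b, v0.4b[0]", where vd=8, vn=2, vm=0 and element index 0 gives h=0 and l=0:

    #include <cstdint>
    #include <cstdio>

    // Same field placement as the macro: base word 0x4f80e000, vd in the low
    // bits, vn shifted by 5, vm by 16, and the element index split into l/h.
    static uint32_t sdot_by_element(unsigned vd, unsigned vn, unsigned vm, unsigned idx)
    {
        const unsigned h = idx / 2;
        const unsigned l = idx % 2;
        return 0x4f80e000u | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11);
    }

    int main()
    {
        printf("0x%08x\n", sdot_by_element(8, 2, 0, 0)); // prints 0x4f80e048
        return 0;
    }
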
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
new file mode 100644
index 0000000000..258ef5e224
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+ const int W = K / 4;
+    // Fix up for odd lengths - set a flag if the block count W is odd, but
+    // make sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W + 1) / 2) - 1;
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_SDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem sdot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index 1588f049f4..2ec28f480c 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,37 +25,43 @@
#ifdef __aarch64__
+namespace arm_gemm
+{
// Load the actual kernel
-#include "a64_gemm_s8_4x4/generic.hpp"
+void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class gemm_s8_4x4 {
+#include "arm_gemm.hpp"
+
+class gemm_s8_4x4
+{
public:
- typedef int8_t operand_type;
+ typedef int8_t operand_type;
typedef int32_t result_type;
typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
/* Describes the data layout for A input */
- static const int A_interleave = 4;
- static const int A_block = 16;
- static const bool A_transpose = false;
+ static const int A_interleave = 4;
+ static const int A_block = 16;
+ static const bool A_transpose = false;
/* Same for B input */
- static const int B_interleave = 4;
- static const int B_block = 16;
- static const bool B_transpose = true;
+ static const int B_interleave = 4;
+ static const int B_block = 16;
+ static const bool B_transpose = true;
/* Kernel blocking parameters */
- static const int out_width = 4;
+ static const int out_width = 4;
static const int out_height = 4;
- static const int k_unroll = 16;
+ static const int k_unroll = 16;
- kern_type kernel = nullptr;
+ kern_type kernel = a64_gemm_s8_4x4;
- gemm_s8_4x4(const CPUInfo *ci) {
- kernel = a64_gemm_s8_4x4;
+ gemm_s8_4x4(const CPUInfo *ci)
+ {
}
};
-#endif // __aarch64__
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
new file mode 100644
index 0000000000..243b94e25b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const int8_t *a_ptr = Apanel;
+ int32_t *c_ptr = Cpanel;
+
+ K /= 16;
+ int oddk = (K & 1);
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const int8_t *a_ptr0 = a_ptr;
+ const int8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+
+ int k = ((K + 1) / 2) - 1;
+
+ register int8x16_t b0 asm("v4");
+ register int8x16_t b1 asm("v5");
+ register int8x16_t b2 asm("v6");
+ register int8x16_t b3 asm("v7");
+ register int8x16_t b0a asm("v8");
+ register int8x16_t b1a asm("v9");
+ register int8x16_t b2a asm("v10");
+ register int8x16_t b3a asm("v11");
+
+ __asm __volatile(
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "movi v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]") "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
+ // Loop structure optimized for A57 (after r0).
+
+ // Unavoidably, the multiply will "dribble" if
+ // dual issued with an add.
+
+ // Minimize the effect of this by making sure
+ // there are 2 adds to run under the dribbled
+ // multiply.
+
+ // Pipeline in blocks of 8 multiplies - combine
+ // this iteration's multiplies with adds from
+ // the previous iteration.
+
+ // So the first block doesn't have any adds to
+ // do - but because all the adds are at the
+ // start of the block it's only the first couple
+ // of multiplies that need to be pulled out.
+
+ // Start of unroll 0 (first iteration)
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Unroll 0 continuation (branch target)
+ "1:\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr], #0]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+ "ldr q0, [%[a_ptr], #128]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+
+ // Start of unroll 0 for next iteration.
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "bne 1b\n"
+
+ // Target to use when K=1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "ldr %q[b0a], [%[b_ptr], #64]\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "ldr %q[b1a], [%[b_ptr], #80]\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr], #64]\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "ldr %q[b2a], [%[b_ptr], #96]\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "ldr %q[b3a], [%[b_ptr], #112]\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "add %[b_ptr], %[b_ptr], #128\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #80]\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #96]\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr q3, [%[a_ptr], #112]\n"
+
+ // Unroll 1
+ "sadalp v28.4s, v12.8h\n"
+ "smull v12.8h, v0.8b, %[b0a].8b\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "smull v13.8h, v0.8b, %[b1a].8b\n"
+ "sadalp v31.4s, v15.8h\n"
+ "smull v14.8h, v0.8b, %[b2a].8b\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "smull v15.8h, v0.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v0.16b, %[b0a].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3a].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0a].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1a].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2a].8b\n"
+ "smull v15.8h, v1.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0a].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1a].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3a].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0a].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1a].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2a].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0a].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2a].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3a].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0a].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1a].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2a].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3a].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0a].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1a].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2a].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3a].16b\n"
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "smull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "smull v15.8h, v0.8b, %[b3].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "smlal2 v12.8h, v0.16b, %[b0].16b\n"
+ "smlal2 v13.8h, v0.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v0.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "sadalp v16.4s, v12.8h\n"
+ "smull v12.8h, v1.8b, %[b0].8b\n"
+ "sadalp v17.4s, v13.8h\n"
+ "sadalp v18.4s, v14.8h\n"
+ "smull v13.8h, v1.8b, %[b1].8b\n"
+ "sadalp v19.4s, v15.8h\n"
+ "smull v14.8h, v1.8b, %[b2].8b\n"
+ "smull v15.8h, v1.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v1.16b, %[b0].16b\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smlal2 v13.8h, v1.16b, %[b1].16b\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smlal2 v14.8h, v1.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "sadalp v20.4s, v12.8h\n"
+ "smull v12.8h, v2.8b, %[b0].8b\n"
+ "sadalp v21.4s, v13.8h\n"
+ "sadalp v22.4s, v14.8h\n"
+ "smull v13.8h, v2.8b, %[b1].8b\n"
+ "sadalp v23.4s, v15.8h\n"
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "smull v14.8h, v2.8b, %[b2].8b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "smull v15.8h, v2.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v2.16b, %[b0].16b\n"
+ "str q16, [%[c_ptr]]\n"
+ "smlal2 v13.8h, v2.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v2.16b, %[b2].16b\n"
+ "smlal2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "sadalp v24.4s, v12.8h\n"
+ "smull v12.8h, v3.8b, %[b0].8b\n"
+ "sadalp v25.4s, v13.8h\n"
+ "sadalp v26.4s, v14.8h\n"
+ "smull v13.8h, v3.8b, %[b1].8b\n"
+ "sadalp v27.4s, v15.8h\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "smull v14.8h, v3.8b, %[b2].8b\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "smull v15.8h, v3.8b, %[b3].8b\n"
+ "smlal2 v12.8h, v3.16b, %[b0].16b\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "smlal2 v13.8h, v3.16b, %[b1].16b\n"
+ "smlal2 v14.8h, v3.16b, %[b2].16b\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "smlal2 v15.8h, v3.16b, %[b3].16b\n"
+
+ "3:\n"
+
+ // Final additions
+ "sadalp v28.4s, v12.8h\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "sadalp v29.4s, v13.8h\n"
+ "sadalp v30.4s, v14.8h\n"
+ "sadalp v31.4s, v15.8h\n"
+
+ // Horizontal reduction, phase 1
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
+
+ // Horizontal reduction, phase 2
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
+
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+ [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a), [b3a] "+w"(b3a),
+ [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
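Note on the kernel above: the 4x4 signed 8-bit micro-kernel accumulates by pairing a widening low-half multiply (smull) with a high-half multiply-accumulate (smlal2), folding the 16-bit products into 32-bit lanes with sadalp, and finally collapsing the accumulators with two rounds of addp. The following is a minimal intrinsics sketch of that per-step accumulation pattern, assuming AArch64 and standard ACLE names; it is illustrative only, since the real kernel keeps all state in registers across the unrolled K loop.

#include <arm_neon.h>

// One accumulation step: 8x8->16-bit products of both halves of a and b,
// then signed pairwise add-and-accumulate into 32-bit lanes (sadalp).
static inline int32x4_t accumulate_s8_step(int32x4_t acc, int8x16_t a, int8x16_t b)
{
    int16x8_t prod = vmull_s8(vget_low_s8(a), vget_low_s8(b)); // smull
    prod = vmlal_high_s8(prod, a, b);                          // smlal2
    return vpadalq_s16(acc, prod);                             // sadalp
}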
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
index 7eb8b2dacf..39757326f4 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,8 +25,10 @@
#ifdef __aarch64__
+namespace arm_gemm
+{
// Actual kernel implementations
-#include "a64_gemm_u16_12x8/generic.hpp"
+void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
// 12x8 SGEMM "strategy" class.
//
@@ -36,7 +38,8 @@
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class gemm_u16_12x8 {
+class gemm_u16_12x8
+{
public:
typedef uint16_t operand_type;
typedef uint32_t result_type;
@@ -45,24 +48,26 @@ public:
/* Describes the data layout for A input */
static const int A_interleave = 8;
- static const int A_block = 1;
- static const int A_transpose = 0;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
/* Same for B input */
static const int B_interleave = 12;
- static const int B_block = 1;
- static const int B_transpose = 1;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
/* Kernel blocking parameters */
- static const int out_width = 12;
+ static const int out_width = 12;
static const int out_height = 8;
- static const int k_unroll = 1;
+ static const int k_unroll = 1;
- kern_type kernel = nullptr;
+ kern_type kernel = a64_gemm_u16_asimd_12x8;
- gemm_u16_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_u16_asimd_12x8;
+ gemm_u16_12x8(const CPUInfo *ci)
+ {
}
};
+} // namespace arm_gemm
+
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
new file mode 100644
index 0000000000..7903878301
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const uint16_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint16_t *a_ptr0 = a_ptr;
+ const uint16_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ const bool odd_k = K & 0x1;
+ int k = (K + 1) / 2 - 1;
+
+ register uint16x8_t aa asm("v0");
+ register uint16x8_t ab asm("v1");
+ register uint16x8_t b0 asm("v2");
+ register uint16x8_t b1 asm("v3");
+ register uint16x8_t b2 asm("v4");
+
+ __asm __volatile(
+ "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower
+ "movi v5.4s, #0\n"
+ "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper
+ "movi v6.4s, #0\n"
+ "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper
+ "movi v7.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v8.4s, #0\n"
+ "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper
+ "movi v9.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v10.4s, #0\n"
+ "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper
+ "movi v11.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #96]")
+ "movi v12.4s, #0\n"
+ "movi v13.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #96]")
+ "movi v14.4s, #0\n"
+ "movi v15.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0\n"
+ "movi v17.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v18.4s, #0\n"
+ "movi v19.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #160]")
+ "movi v20.4s, #0\n"
+ "movi v21.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #160]")
+ "movi v22.4s, #0\n"
+ "movi v23.4s, #0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v24.4s, #0\n"
+ "add %x[a_ptr], %x[a_ptr], #0x10\n"
+ "movi v25.4s, #0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v26.4s, #0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x18\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+
+ "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations.
+
+ "1:\n" // Main loop
+ // First unroll
+ "smlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ // Second unroll
+ "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower
+ "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper
+ "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper
+ "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "add %x[a_ptr], %x[a_ptr], #0x20\n"
+ "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "subs %x[k], %x[k], #0x1\n"
+ "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower
+ "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper
+ "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "add %x[b_ptr], %x[b_ptr], #0x30\n"
+ "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "bne 1b\n"
+
+ "2:\n" // Even tail
+ "cbnz %x[odd_k], 3f\n"
+
+ "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower
+ "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower
+ "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "add %[a_ptr], %[a_ptr], #0x10\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "add %[b_ptr], %[b_ptr], #0x18\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+
+ "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n"
+ "umlal v13.4s, %[b2].4h, %[ab].h[0]\n"
+ "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n"
+ "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n"
+ "umlal v14.4s, %[b2].4h, %[ab].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "umlal v15.4s, %[b2].4h, %[ab].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "umlal v16.4s, %[b2].4h, %[ab].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "umlal v17.4s, %[b2].4h, %[ab].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "umlal v18.4s, %[b2].4h, %[ab].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "umlal v19.4s, %[b2].4h, %[ab].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "umlal v20.4s, %[b2].4h, %[ab].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+ "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n"
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "b 4f\n" // Complete write out
+
+ "3:\n" // Odd tail
+ "umlal v5.4s, %[b0].4h, %[aa].h[0]\n"
+ "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n"
+ "umlal v21.4s, %[b1].4h, %[aa].h[0]\n"
+ "umlal v6.4s, %[b0].4h, %[aa].h[1]\n"
+ "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n"
+ "umlal v22.4s, %[b1].4h, %[aa].h[1]\n"
+ "str q5, [%x[c_ptr]]\n"
+ "umlal v7.4s, %[b0].4h, %[aa].h[2]\n"
+ "str q13, [%x[c_ptr], #0x10]\n"
+ "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n"
+ "str q21, [%x[c_ptr], #0x20]\n"
+ "umlal v23.4s, %[b1].4h, %[aa].h[2]\n"
+ "str q6, [%x[c_ptr], #0x30]\n"
+ "umlal v8.4s, %[b0].4h, %[aa].h[3]\n"
+ "str q14, [%x[c_ptr], #0x40]\n"
+ "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n"
+ "str q22, [%x[c_ptr], #0x50]\n"
+ "umlal v24.4s, %[b1].4h, %[aa].h[3]\n"
+ "str q7, [%x[c_ptr], #0x60]\n"
+ "umlal v9.4s, %[b0].4h, %[aa].h[4]\n"
+ "str q15, [%x[c_ptr], #0x70]\n"
+ "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n"
+ "str q23, [%x[c_ptr], #0x80]\n"
+ "umlal v25.4s, %[b1].4h, %[aa].h[4]\n"
+ "str q8, [%x[c_ptr], #0x90]\n"
+ "umlal v10.4s, %[b0].4h, %[aa].h[5]\n"
+ "str q16, [%x[c_ptr], #0xa0]\n"
+ "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n"
+ "str q24, [%x[c_ptr], #0xb0]\n"
+ "umlal v26.4s, %[b1].4h, %[aa].h[5]\n"
+ "str q9, [%x[c_ptr], #0xc0]\n"
+ "umlal v11.4s, %[b0].4h, %[aa].h[6]\n"
+ "str q17, [%x[c_ptr], #0xd0]\n"
+ "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n"
+ "str q25, [%x[c_ptr], #0xe0]\n"
+ "umlal v27.4s, %[b1].4h, %[aa].h[6]\n"
+ "str q10, [%x[c_ptr], #0xf0]\n"
+ "umlal v12.4s, %[b0].4h, %[aa].h[7]\n"
+ "str q18, [%x[c_ptr], #0x100]\n"
+ "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n"
+ "str q26, [%x[c_ptr], #0x110]\n"
+ "umlal v28.4s, %[b1].4h, %[aa].h[7]\n"
+ "str q11, [%x[c_ptr], #0x120]\n"
+
+ "4:\n" // End of function
+ "str q19, [%x[c_ptr], #0x130]\n"
+ "str q27, [%x[c_ptr], #0x140]\n"
+ "str q12, [%x[c_ptr], #0x150]\n"
+ "str q20, [%x[c_ptr], #0x160]\n"
+ "str q28, [%x[c_ptr], #0x170]\n"
+ "add %x[c_ptr], %x[c_ptr], #0x180\n"
+ : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr), [k] "+r"(k),
+ [aa] "+w"(aa), [ab] "+w"(ab), [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2)
+ : [odd_k] "r"(odd_k)
+ : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
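The kernel above peels one two-step tail off the main loop: it sets odd_k when K is odd and runs (K + 1) / 2 - 1 two-step loop iterations, with the remaining one or two K steps handled by the detached tails at labels 2 and 3. A small sketch of that bookkeeping (names are illustrative):

// Split a depth K into full two-step loop passes plus a peeled tail.
static void split_k(int K, int &loops, bool &odd_k)
{
    odd_k = (K & 0x1) != 0;   // is there a lone trailing step?
    loops = (K + 1) / 2 - 1;  // two-step passes, excluding the peeled tail
    // K = 5 -> odd_k = true,  loops = 2: 2*2 steps in the loop + 1-step odd tail
    // K = 6 -> odd_k = false, loops = 2: 2*2 steps in the loop + 2-step even tail
}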
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
index 62cd747d7c..26255b14bf 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,41 +25,48 @@
#ifdef __aarch64__
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
// Load the actual kernel
-#include "a64_gemm_u8_12x8/generic.hpp"
-#include "a64_gemm_u8_12x8/a55r1.hpp"
+void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class gemm_u8_12x8 {
+class gemm_u8_12x8
+{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
/* Describes the data layout for A input */
- static const int A_interleave = 8;
- static const int A_block = 4;
- static const bool A_transpose = false;
+ static const int A_interleave = 8;
+ static const int A_block = 4;
+ static const bool A_transpose = false;
/* Same for B input */
- static const int B_interleave = 12;
- static const int B_block = 4;
- static const bool B_transpose = true;
+ static const int B_interleave = 12;
+ static const int B_block = 4;
+ static const bool B_transpose = true;
/* Kernel blocking parameters */
- static const int out_width = 12;
+ static const int out_width = 12;
static const int out_height = 8;
- static const int k_unroll = 4;
+ static const int k_unroll = 4;
- kern_type kernel = nullptr;
+ kern_type kernel = a64_gemm_u8_12x8;
- gemm_u8_12x8(const CPUInfo *ci) {
- kernel = a64_gemm_u8_12x8;
- if (ci->CPU == CPUTarget::A55_DOT) {
+ gemm_u8_12x8(const CPUInfo *ci)
+ {
+ if(ci->get_cpu_model() == CPUModel::A55r1)
+ {
kernel = a64_gemm_u8_12x8_a55r1;
}
}
};
-#endif // __aarch64__
+} // namespace arm_gemm
+#endif // __aarch64__
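The strategy classes in these headers expose the panel-layout constants plus a kernel pointer that the constructor resolves for the detected CPU (here, the A55r1 dot-product variant when available). A hedged sketch of how such a strategy is consumed follows; the driver name and call shape are assumptions, not the library's exact API, and CPUInfo is the library's own type.

class CPUInfo; // provided by the library

template <typename strategy>
void run_panels(const CPUInfo *ci,
                const typename strategy::operand_type *a_panel,
                const typename strategy::operand_type *b_panel,
                typename strategy::result_type *c_panel,
                int ablocks, int bblocks, int K)
{
    strategy strat(ci);                      // may select e.g. the A55r1 kernel
    strat.kernel(a_panel, b_panel, c_panel,  // run the micro-kernel over the panels
                 ablocks, bblocks, K);
}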
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
new file mode 100644
index 0000000000..f8fafbdf84
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+
+ // We divide K by 4 because the udot instruction processes 4 elements at a time.
+ const int W = K / 4;
+
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int k_iters = ((W + 1) / 2) - 1;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = k_iters;
+
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_UDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+ // The loop is offset by these two instructions which must
+ // always be executed.
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "b.ne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a.
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "b 3f\n"
+
+ // Odd K continuation
+ "2:\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ ASM_PREFETCHWL2("[%[c_ptr], #640]") "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem udot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
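A distinguishing feature of the a55r1 variant above is that every 128-bit "ldr q" is replaced by a 64-bit "ldr d", a 64-bit "ldr x20" and an "ins vN.d[1], x20", presumably to suit the narrower load bandwidth of Cortex-A55 and let the scalar load dual-issue with the udot work. A rough intrinsics sketch of the same merge (illustrative only; the real kernel schedules the three pieces far apart between udot instructions):

#include <arm_neon.h>
#include <cstdint>
#include <cstring>

static inline uint8x16_t load_split_128(const uint8_t *p)
{
    uint8x8_t lo = vld1_u8(p);              // "ldr %d[reg], [ptr]"
    uint64_t  hi;                           // "ldr x20, [ptr, #8]"
    memcpy(&hi, p + 8, sizeof(hi));
    return vcombine_u8(lo, vcreate_u8(hi)); // "ins %[reg].d[1], x20"
}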
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000000..5ee273bd74
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the UDOT instruction (in the absence of toolchain support)
+#define _DECLARE_UDOT \
+ ".altmacro\n" \
+ ".macro udot opd:req, opn:req, opm:req\n" \
+ "local vd, vn, vm, h, l\n" \
+ ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n" \
+ ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n" \
+ ".set vd,\\reg\n" \
+ ".endif\n" \
+ ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n" \
+ ".set vn,\\reg\n" \
+ ".endif\n" \
+ ".irp idx,0,1,2,3\n" \
+ ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n" \
+ ".set vm,\\reg\n" \
+ ".set h,\\idx / 2\n" \
+ ".set l,\\idx %% 2\n" \
+ ".endif\n" \
+ ".endr\n" \
+ ".endr\n" \
+ ".ifndef vd\n" \
+ ".error \"Bad operand \\opd\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vn\n" \
+ ".error \"Bad operand \\opn\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef vm\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef h\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".ifndef l\n" \
+ ".error \"Bad operand \\opm\"\n" \
+ ".exitm\n" \
+ ".endif\n" \
+ ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n" \
+ ".endm\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
new file mode 100644
index 0000000000..d026dc54f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+#include "dot_toolchain_support.h"
+#endif
+
+namespace arm_gemm
+{
+void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ // We divide K by 4 because the udot instruction processes 4 elements at a time.
+ const int W = K / 4;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ const int oddk = (W & 1);
+ const int init_value_k = ((W + 1) / 2) - 1;
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = init_value_k;
+ register uint8x16_t a0 asm("v0");
+ register uint8x16_t a1 asm("v1");
+ register uint8x16_t b0 asm("v2");
+ register uint8x16_t b1 asm("v3");
+ register uint8x16_t b2 asm("v4");
+ register uint8x16_t a0a asm("v5");
+ register uint8x16_t a1a asm("v6");
+ __asm __volatile(
+#ifdef NO_DOT_IN_TOOLCHAIN
+ _DECLARE_UDOT
+#else
+ ".arch armv8.2-a+dotprod\n"
+#endif
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+#ifdef NO_DOT_IN_TOOLCHAIN
+ ".purgem udot\n"
+#endif
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
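As the comment in the kernel above notes, K is divided by 4 because each udot consumes four consecutive bytes per lane: "udot v8.4s, %[b0].16b, %[a0].4b[0]" adds, to every 32-bit lane of the accumulator, the dot product of one 4-byte group of B with the selected 4-byte group of A. A sketch of one such step with the ACLE dot-product intrinsic (the compile-time guard and helper name are assumptions):

#if defined(__ARM_FEATURE_DOTPROD)
#include <arm_neon.h>

static inline uint32x4_t udot_lane0(uint32x4_t acc, uint8x16_t b0, uint8x16_t a0)
{
    return vdotq_laneq_u32(acc, b0, a0, 0); // acc.4s += b0.16b dot a0.4b[0]
}
#endif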
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 3561bfec96..5aa5291a29 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,37 +25,42 @@
#ifdef __aarch64__
-// Load the actual kernel
-#include "a64_gemm_u8_4x4/generic.hpp"
+namespace arm_gemm
+{
+// Kernel definition
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
-class gemm_u8_4x4 {
+class gemm_u8_4x4
+{
public:
- typedef uint8_t operand_type;
+ typedef uint8_t operand_type;
typedef uint32_t result_type;
typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
/* Describes the data layout for A input */
- static const int A_interleave = 4;
- static const int A_block = 16;
- static const bool A_transpose = false;
+ static const int A_interleave = 4;
+ static const int A_block = 16;
+ static const bool A_transpose = false;
/* Same for B input */
- static const int B_interleave = 4;
- static const int B_block = 16;
- static const bool B_transpose = true;
+ static const int B_interleave = 4;
+ static const int B_block = 16;
+ static const bool B_transpose = true;
/* Kernel blocking parameters */
- static const int out_width = 4;
+ static const int out_width = 4;
static const int out_height = 4;
- static const int k_unroll = 16;
+ static const int k_unroll = 16;
kern_type kernel = nullptr;
- gemm_u8_4x4(const CPUInfo *ci) {
+ gemm_u8_4x4(const CPUInfo *ci)
+ {
kernel = a64_gemm_u8_4x4;
}
};
-#endif // __aarch64__
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
new file mode 100644
index 0000000000..0a881ffde3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+{
+ const uint8_t *a_ptr = Apanel;
+ uint32_t *c_ptr = Cpanel;
+ K /= 16;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const uint8_t *a_ptr0 = a_ptr;
+ const uint8_t *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+
+ int k = K - 1;
+
+ register uint8x16_t b0 asm("v4");
+ register uint8x16_t b1 asm("v5");
+ register uint8x16_t b2 asm("v6");
+ register uint8x16_t b3 asm("v7");
+
+ __asm __volatile(
+ "movi v16.4s, #0x0\n"
+ "ldr q0, [%[a_ptr]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "movi v24.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v26.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v27.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v28.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]") "movi v30.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]") "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 2f\n"
+
+ "1:\n"
+ "uadalp v16.4s, v12.8h\n"
+ "umull2 v12.8h, v0.16b, %[b0].16b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull2 v13.8h, v0.16b, %[b1].16b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull2 v14.8h, v0.16b, %[b2].16b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull2 v15.8h, v0.16b, %[b3].16b\n"
+ "ldr q0, [%[a_ptr]]\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "subs %w[k], %w[k], #1\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull2 v12.8h, v1.16b, %[b0].16b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull2 v13.8h, v1.16b, %[b1].16b\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "uadalp v22.4s, v14.8h\n"
+ "umull2 v14.8h, v1.16b, %[b2].16b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull2 v15.8h, v1.16b, %[b3].16b\n"
+ "ldr q1, [%[a_ptr], #16]\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "uadalp v22.4s, v14.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull2 v12.8h, v2.16b, %[b0].16b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull2 v13.8h, v2.16b, %[b1].16b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull2 v14.8h, v2.16b, %[b2].16b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull2 v15.8h, v2.16b, %[b3].16b\n"
+ "ldr q2, [%[a_ptr], #32]\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull2 v12.8h, v3.16b, %[b0].16b\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull2 v13.8h, v3.16b, %[b1].16b\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull2 v14.8h, v3.16b, %[b2].16b\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull2 v15.8h, v3.16b, %[b3].16b\n"
+ "ldr %q[b3], [%[b_ptr], #48]\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull v12.8h, v0.8b, %[b0].8b\n"
+ "add %[b_ptr], %[b_ptr], #64\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull v13.8h, v0.8b, %[b1].8b\n"
+ "ldr q3, [%[a_ptr], #48]\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull v14.8h, v0.8b, %[b2].8b\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull v15.8h, v0.8b, %[b3].8b\n"
+ "bne 1b\n"
+
+ // Branch target
+ "2:\n"
+ "uadalp v16.4s, v12.8h\n"
+ "umull2 v12.8h, v0.16b, %[b0].16b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull2 v13.8h, v0.16b, %[b1].16b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull2 v14.8h, v0.16b, %[b2].16b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull2 v15.8h, v0.16b, %[b3].16b\n"
+
+ "uadalp v16.4s, v12.8h\n"
+ "umull v12.8h, v1.8b, %[b0].8b\n"
+ "uadalp v17.4s, v13.8h\n"
+ "umull v13.8h, v1.8b, %[b1].8b\n"
+ "uadalp v18.4s, v14.8h\n"
+ "umull v14.8h, v1.8b, %[b2].8b\n"
+ "uadalp v19.4s, v15.8h\n"
+ "umull v15.8h, v1.8b, %[b3].8b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull2 v12.8h, v1.16b, %[b0].16b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull2 v13.8h, v1.16b, %[b1].16b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull2 v14.8h, v1.16b, %[b2].16b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull2 v15.8h, v1.16b, %[b3].16b\n"
+
+ "uadalp v20.4s, v12.8h\n"
+ "umull v12.8h, v2.8b, %[b0].8b\n"
+ "uadalp v21.4s, v13.8h\n"
+ "umull v13.8h, v2.8b, %[b1].8b\n"
+ "uadalp v22.4s, v14.8h\n"
+ "umull v14.8h, v2.8b, %[b2].8b\n"
+ "uadalp v23.4s, v15.8h\n"
+ "umull v15.8h, v2.8b, %[b3].8b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull2 v12.8h, v2.16b, %[b0].16b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull2 v13.8h, v2.16b, %[b1].16b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull2 v14.8h, v2.16b, %[b2].16b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull2 v15.8h, v2.16b, %[b3].16b\n"
+
+ "uadalp v24.4s, v12.8h\n"
+ "umull v12.8h, v3.8b, %[b0].8b\n"
+ "uadalp v25.4s, v13.8h\n"
+ "umull v13.8h, v3.8b, %[b1].8b\n"
+ "uadalp v26.4s, v14.8h\n"
+ "umull v14.8h, v3.8b, %[b2].8b\n"
+ "uadalp v27.4s, v15.8h\n"
+ "umull v15.8h, v3.8b, %[b3].8b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "umull2 v12.8h, v3.16b, %[b0].16b\n"
+ "uadalp v29.4s, v13.8h\n"
+ "umull2 v13.8h, v3.16b, %[b1].16b\n"
+ "uadalp v30.4s, v14.8h\n"
+ "umull2 v14.8h, v3.16b, %[b2].16b\n"
+ "uadalp v31.4s, v15.8h\n"
+ "umull2 v15.8h, v3.16b, %[b3].16b\n"
+
+ "uadalp v28.4s, v12.8h\n"
+ "uadalp v29.4s, v13.8h\n"
+ "uadalp v30.4s, v14.8h\n"
+ "uadalp v31.4s, v15.8h\n"
+
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+ "addp v20.4s, v24.4s, v25.4s\n"
+ "addp v21.4s, v26.4s, v27.4s\n"
+ "addp v22.4s, v28.4s, v29.4s\n"
+ "addp v23.4s, v30.4s, v31.4s\n"
+
+ "addp v16.4s, v16.4s, v17.4s\n"
+ "addp v17.4s, v18.4s, v19.4s\n"
+ "addp v18.4s, v20.4s, v21.4s\n"
+ "addp v19.4s, v22.4s, v23.4s\n"
+
+ "str q16, [%[c_ptr]]\n"
+ "str q17, [%[c_ptr], #16]\n"
+ "str q18, [%[c_ptr], #32]\n"
+ "str q19, [%[c_ptr], #48]\n"
+ "add %[c_ptr], %[c_ptr], #64\n"
+
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [b3] "+w"(b3),
+ [k] "+r"(k)
+ :
+ : "x20", "x21", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
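
The 4x4 uint8 block above multiplies 8-bit operands into 16-bit lanes with umull/umull2, widens and accumulates them into 32-bit lanes with uadalp, and finally collapses each accumulator to a single lane with the addp ladder before storing a 4x4 block of uint32 results. A minimal scalar sketch of the same sum, assuming the panels are interleaved in 16-deep K blocks (four rows of A, or four columns of B, stored back to back per block, matching the 64-byte pointer advances above); the helper name and the kblocks parameter are illustrative, not part of the patch:

    #include <cstdint>

    static void reference_ugemm_4x4(const uint8_t *Apanel, const uint8_t *Bpanel,
                                    uint32_t *Cpanel, int kblocks)
    {
        for(int i = 0; i < 4; i++)          // output rows (A operand)
        {
            for(int j = 0; j < 4; j++)      // output columns (B operand)
            {
                uint32_t acc = 0;
                for(int kb = 0; kb < kblocks; kb++)
                {
                    for(int k = 0; k < 16; k++)
                    {
                        acc += uint32_t(Apanel[kb * 64 + i * 16 + k]) * uint32_t(Bpanel[kb * 64 + j * 16 + k]);
                    }
                }
                Cpanel[i * 4 + j] = acc;    // same row-major order as the q16..q19 stores
            }
        }
    }
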
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 5e7684f692..77ec59aa35 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,45 +23,52 @@
*/
#pragma once
-#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
-// Get the components we need to implement SGEMM.
-// Can select appropriate components dependent on AArch32 vs. AArch64 etc. at build time.
-#include "a64_hgemm_24x8/generic.hpp"
-#include "a64_hgemm_24x8/a55r1.hpp"
+#include "arm_gemm.hpp"
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
// 24x8 HGEMM "strategy" class. Describes the kernel properties.
//
// The generic "gemm_opt" function will instantiate one of these (allowing
// the constructor to pick a kernel implementation).
-class hgemm_24x8 {
+class hgemm_24x8
+{
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
- static const int A_block = 1;
- static const int A_interleave = 8;
- static const bool A_transpose = false;
+ static const int A_block = 1;
+ static const int A_interleave = 8;
+ static const bool A_transpose = false;
- static const int B_block = 1;
- static const int B_interleave = 24;
- static const bool B_transpose = true;
+ static const int B_block = 1;
+ static const int B_interleave = 24;
+ static const bool B_transpose = true;
- static const int out_width = 24;
+ static const int out_width = 24;
static const int out_height = 8;
- static const int k_unroll = 1;
+ static const int k_unroll = 1;
- kern_type kernel = nullptr;
+ // Default to the generic kernel
+ kern_type kernel = a64_hgemm_asimd_24x8;
- hgemm_24x8(const struct CPUInfo *ci) {
- kernel = a64_hgemm_asimd_24x8;
- if (ci->CPU == CPUTarget::A55_DOT) {
+ hgemm_24x8(const CPUInfo *ci)
+ {
+ if(ci->get_cpu_model() == CPUModel::A55r1)
+ {
kernel = a64_hgemm_asimd_24x8_a55r1;
}
}
-
};
-#endif // __aarch64__ and FP16_VECTOR_ARITHMETIC
+} // namespace arm_gemm
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
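
With the kernel bodies moved out into .cpp files, the strategy header above only declares the two entry points and describes the blocking; the constructor picks the A55r1 variant at runtime from the CPUInfo. A minimal usage sketch, assuming it is built in the same context as the header (CPUInfo visible, __fp16 enabled) and that the caller already owns correctly interleaved panels; the run_hgemm_block helper is illustrative, not part of the patch:

    void run_hgemm_block(const CPUInfo *ci, const __fp16 *Apanel, const __fp16 *Bpanel,
                         __fp16 *Cpanel, int ablocks, int bblocks, int K)
    {
        // Constructor selects a64_hgemm_asimd_24x8_a55r1 on A55r1, the generic kernel otherwise.
        arm_gemm::hgemm_24x8 strat(ci);

        // A is interleaved in groups of A_interleave (8) rows, B is transposed and
        // interleaved in groups of B_interleave (24) columns; the kernel consumes
        // the panels directly.
        strat.kernel(Apanel, Bpanel, Cpanel, ablocks, bblocks, K);
    }
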
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
new file mode 100644
index 0000000000..d59618dd54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+ const __fp16 *a_ptr = Apanel;
+ __fp16 *c_ptr = Cpanel;
+
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k_iters = ((K + 1) / 2) - 1;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const __fp16 *a_ptr0 = a_ptr;
+ const __fp16 *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ int k = k_iters;
+ a_ptr = a_ptr0;
+
+ // As A55 requires 64-bit loads anyway, just use 64 bits of the
+ // "A" operands to save on "ins" instructions. Since A55 is
+ // in-order, two sets of "A" operands and one set of "B" is
+ // sufficient.
+ register float16x8_t a0 asm("v0");
+ register float16x8_t a1 asm("v1");
+ register float16x8_t a0a asm("v2");
+ register float16x8_t a1a asm("v3");
+ register float16x8_t b0 asm("v4");
+ register float16x8_t b1 asm("v5");
+ register float16x8_t b2 asm("v6");
+
+ __asm __volatile(
+ // Enable FP16 extensions
+ ".arch armv8.2-a+fp16\n"
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.8h, #0x0\n"
+ "ldr %d[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %d[a1], [%[a_ptr], #8]\n"
+ "movi v11.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.8h, #0x0\n"
+ "movi v13.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v14.8h, #0x0\n"
+ "movi v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v16.8h, #0x0\n"
+ "movi v17.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v18.8h, #0x0\n"
+ "movi v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
+
+ // The loop is offset by these two instructions which must
+ // always be executed.
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #16]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n"
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #24]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ // Unroll 1
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n"
+ "ldr %d[a0], [%[a_ptr], #32]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n"
+ "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
+ "ldr %d[a1], [%[a_ptr], #40]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n"
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n"
+
+ "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "bne 1b\n"
+
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #16]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #24]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1].h[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.8h , %[b0].8h, %[a0a].h[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.8h , %[b0].8h, %[a0a].h[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.8h, %[b0].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0a].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+ "fmla v12.8h, %[b0].8h, %[a1a].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1a].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v14.8h, %[b0].8h, %[a1a].h[2]\n"
+ "fmla v15.8h, %[b0].8h, %[a1a].h[3]\n"
+ "ldr %d[a1], [%[a_ptr], #40]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0a].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v17.8h, %[b1].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v18.8h, %[b1].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+ "fmla v20.8h, %[b1].8h, %[a1a].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "fmla v22.8h, %[b1].8h, %[a1a].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+
+ "fmla v24.8h, %[b2].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0a].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "fmla v26.8h, %[b2].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0a].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+ "fmla v28.8h, %[b2].8h, %[a1a].h[0]\n"
+ "fmla v29.8h, %[b2].8h, %[a1a].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.8h, %[b2].8h, %[a1a].h[2]\n"
+ "fmla v31.8h, %[b2].8h, %[a1a].h[3]\n"
+ "b 3f\n"
+
+ "2:\n"
+
+ // Odd tail
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+
+ "fmla v12.8h, %[b0].8h, %[a1].h[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.8h, %[b0].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v14.8h, %[b0].8h, %[a1].h[2]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "fmla v15.8h, %[b0].8h, %[a1].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+
+ "fmla v20.8h, %[b1].8h, %[a1].h[0]\n"
+ "fmla v21.8h, %[b1].8h, %[a1].h[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v22.8h, %[b1].8h, %[a1].h[2]\n"
+ "fmla v23.8h, %[b1].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+
+ "fmla v28.8h, %[b2].8h, %[a1].h[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla v29.8h, %[b2].8h, %[a1].h[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla v30.8h, %[b2].8h, %[a1].h[2]\n"
+ ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla v31.8h, %[b2].8h, %[a1].h[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+
+ // Common tail
+ // A55 won't dual issue these stores with anything else, so
+ // simplest to do them all in this common code.
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "5:\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "=w"(a0), [a0a] "=w"(a0a), [a1] "=w"(a1), [a1a] "=w"(a1a),
+ [b0] "=w"(b0), [b1] "=w"(b1), [b2] "=w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
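
The odd-K handling above sets oddk when K is odd and sizes the main loop as ((K + 1) / 2) - 1 iterations of two K steps each, so either one or two steps are always left for the peeled tail. The same bookkeeping applied to a plain scalar dot product, for illustration only (float is used simply to avoid needing __fp16 support in the sketch; the helper name is hypothetical):

    static float dot_with_peeled_tail(const float *a, const float *b, int K)
    {
        int   oddk    = (K & 1);             // 1 -> single-step tail
        int   k_iters = ((K + 1) / 2) - 1;   // main-loop iterations, two K steps each
        float acc     = 0.0f;
        int   k       = 0;

        for(int i = 0; i < k_iters; i++, k += 2)
        {
            acc += a[k] * b[k] + a[k + 1] * b[k + 1];
        }

        if(oddk)
        {
            acc += a[k] * b[k];                        // odd tail: one remaining step
        }
        else
        {
            acc += a[k] * b[k] + a[k + 1] * b[k + 1];  // even tail: two remaining steps
        }
        return acc;
    }

For K = 7 this gives oddk = 1 and k_iters = 3: the loop covers six K steps and the odd tail the seventh, matching the kernel's structure (K >= 1 is assumed, as it is by the kernel).
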
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
new file mode 100644
index 0000000000..468d603484
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm
+{
+void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K)
+{
+ const __fp16 *a_ptr = Apanel;
+ __fp16 *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const __fp16 *a_ptr0 = a_ptr;
+ const __fp16 *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
+
+ register float16x8_t a0 asm("v0");
+ register float16x8_t a0a asm("v1");
+ register float16x8_t b0 asm("v2");
+ register float16x8_t b1 asm("v3");
+ register float16x8_t b2 asm("v4");
+ register float16x8_t b0a asm("v5");
+ register float16x8_t b1a asm("v6");
+ register float16x8_t b2a asm("v7");
+
+ __asm __volatile(
+ ".arch armv8.2-a+fp16\n"
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.8h, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.8h, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.8h, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v11.8h, #0x0\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "movi v12.8h, #0x0\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+ "movi v13.8h, #0x0\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+ "movi v14.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v15.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v16.8h, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v17.8h, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]") "movi v18.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v19.8h, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.8h, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v22.8h, #0x0\n"
+ "movi v23.8h, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v25.8h, #0x0\n"
+ "movi v26.8h, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v28.8h, #0x0\n"
+ "movi v29.8h, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v31.8h, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[a_ptr], #128]")
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" ASM_PREFETCH("[%[b_ptr], #288]")
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+ "ldr %q[a0], [%[a_ptr], #32]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "ldr %q[b0a], [%[b_ptr], #48]\n"
+
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" ASM_PREFETCH("[%[b_ptr], #352]")
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "ldr %q[b1a], [%[b_ptr], #64]\n"
+
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+
+ "bne 1b\n"
+ "4:\n"
+
+ // Jump to odd tail if necessary.
+ "cbnz %w[oddk], 2f\n"
+
+ // Even tail.
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "ldr %q[a0a], [%[a_ptr], #16]\n"
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "ldr %q[b2a], [%[b_ptr], #80]\n"
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n"
+ "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n"
+ "b 3f\n"
+
+ // Odd tail
+ "2:\n"
+ "fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
+ "add %[a_ptr], %[a_ptr], #16\n"
+ "str q8, [%[c_ptr]]\n"
+ "fmla v24.8h, %[b2].8h, %[a0].h[0]\n"
+ "str q16, [%[c_ptr], #16]\n"
+
+ "fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v25.8h, %[b2].8h, %[a0].h[1]\n"
+ "str q17, [%[c_ptr], #64]\n"
+
+ "fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "fmla v26.8h, %[b2].8h, %[a0].h[2]\n"
+ "str q18, [%[c_ptr], #112]\n"
+
+ "fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "fmla v27.8h, %[b2].8h, %[a0].h[3]\n"
+ "str q19, [%[c_ptr], #160]\n"
+
+ "fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "fmla v28.8h, %[b2].8h, %[a0].h[4]\n"
+ "str q20, [%[c_ptr], #208]\n"
+
+ "fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "fmla v29.8h, %[b2].8h, %[a0].h[5]\n"
+ "str q21, [%[c_ptr], #256]\n"
+
+ "fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "fmla v30.8h, %[b2].8h, %[a0].h[6]\n"
+ "str q22, [%[c_ptr], #304]\n"
+
+ "fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "fmla v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a0a] "+w"(a0a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k),
+ [b0a] "+w"(b0a), [b1a] "+w"(b1a), [b2a] "+w"(b2a)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
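
In the generic kernel above each output row of the 24x8 block is held in three accumulators: v(8+r) covers columns 0-7 of row r, v(16+r) columns 8-15 and v(24+r) columns 16-23, and the stores then write the block out row-major at 24 halves (48 bytes) per row. A sketch of that mapping with a plain array standing in for the accumulator registers (the helper is illustrative only):

    static void store_block_24x8(__fp16 *Cpanel, const __fp16 acc[3][8][8])
    {
        // acc[g][r][l]: column group g (columns g*8 .. g*8+7), row r, lane l
        for(int r = 0; r < 8; r++)
        {
            for(int g = 0; g < 3; g++)
            {
                for(int l = 0; l < 8; l++)
                {
                    Cpanel[r * 24 + g * 8 + l] = acc[g][r][l];
                }
            }
        }
    }

Eight rows of 48 bytes account for the final "add %[c_ptr], %[c_ptr], #384".
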
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index 603ad8dc0a..91a9e8de60 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,12 +25,13 @@
#ifdef __aarch64__
+namespace arm_gemm
+{
// Actual kernel implementations
-#include "a64_sgemm_12x8/generic.hpp"
-#include "a64_sgemm_12x8/a53.hpp"
-#include "a64_sgemm_12x8/a55.hpp"
-#include "a64_sgemm_12x8/a55r1.hpp"
-
+void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
// 12x8 SGEMM "strategy" class.
//
@@ -40,7 +41,8 @@
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class sgemm_12x8 {
+class sgemm_12x8
+{
public:
typedef float operand_type;
typedef float result_type;
@@ -49,33 +51,45 @@ public:
/* Describes the data layout for A input */
static const int A_interleave = 8;
- static const int A_block = 1;
- static const int A_transpose = 0;
+ static const int A_block = 1;
+ static const int A_transpose = 0;
/* Same for B input */
static const int B_interleave = 12;
- static const int B_block = 1;
- static const int B_transpose = 1;
+ static const int B_block = 1;
+ static const int B_transpose = 1;
/* Kernel blocking parameters */
- static const int out_width = 12;
+ static const int out_width = 12;
static const int out_height = 8;
- static const int k_unroll = 1;
+ static const int k_unroll = 1;
- kern_type kernel{nullptr};
+ kern_type kernel = a64_sgemm_asimd_12x8;
- sgemm_12x8(const CPUInfo *ci) {
- kernel = a64_sgemm_asimd_12x8;
- if (ci->CPU == CPUTarget::A53) {
- kernel = a64_sgemm_asimd_12x8_a53;
- }
- else if (ci->CPU == CPUTarget::A55) {
- kernel = a64_sgemm_asimd_12x8_a55;
- }
- else if (ci->CPU == CPUTarget::A55_DOT) {
- kernel = a64_sgemm_asimd_12x8_a55r1;
+ sgemm_12x8(const CPUInfo *ci)
+ {
+ // Select specific kernel if available
+ switch(ci->get_cpu_model())
+ {
+ case CPUModel::A53:
+ kernel = a64_sgemm_asimd_12x8_a53;
+ break;
+
+ case CPUModel::A55r0:
+ kernel = a64_sgemm_asimd_12x8_a55;
+ break;
+
+ case CPUModel::A55r1:
+ kernel = a64_sgemm_asimd_12x8_a55r1;
+ break;
+
+ default:
+ /* Generic kernel is initialized by default. */
+ break;
}
}
};
+} // namespace arm_gemm
+
#endif // __aarch64__
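
The strategy above now exposes the four kernel variants as plain declarations and selects between them with a CPUModel switch, defaulting to the generic kernel. Its blocking constants also describe how the operands must be presented: A interleaved in groups of A_interleave rows, B transposed and interleaved in groups of B_interleave columns, and K padded to a multiple of k_unroll. A hedged sketch of reading those constants to size an interleaved A panel (the helper is hypothetical and not the library's own buffer-size computation):

    template <typename strategy>
    static size_t interleaved_a_elements(unsigned int M, unsigned int K)
    {
        // Round M up to a whole number of interleave groups and K up to the
        // kernel's unroll factor; the panel then holds one operand_type element
        // per (row, k) pair.
        unsigned int m_rounded = ((M + strategy::A_interleave - 1) / strategy::A_interleave) * strategy::A_interleave;
        unsigned int k_rounded = ((K + strategy::k_unroll - 1) / strategy::k_unroll) * strategy::k_unroll;

        return static_cast<size_t>(m_rounded) * k_rounded;
    }

    // e.g. interleaved_a_elements<arm_gemm::sgemm_12x8>(M, K) floats for the kernel above.
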
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
new file mode 100644
index 0000000000..618ebc733c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+ "nop\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+
+ "nop\n"
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+
+ "nop\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration. (even K)
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ins %[b0].d[1], x20\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+
+ "nop\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "nop\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+
+ "ins %[b1].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "nop\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
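
The A53 variant above replaces each 128-bit operand load with a 64-bit NEON load, a 64-bit general-purpose load and an "ins" of the top half, and interleaves those pieces with the FMLAs so the in-order A53 can keep issuing. An intrinsics sketch of why the three instructions add up to a full vector load (illustrative only; the kernel keeps this entirely in assembly):

    #include <arm_neon.h>
    #include <cstring>

    static inline float32x4_t load_q_split(const float *p)
    {
        float32x2_t lo = vld1_f32(p);                    // "ldr %d[reg], [ptr]"

        uint64_t hi_bits;
        std::memcpy(&hi_bits, p + 2, sizeof(hi_bits));   // "ldr x20, [ptr, #8]"

        float32x2_t hi = vcreate_f32(hi_bits);           // "ins %[reg].d[1], x20"
        return vcombine_f32(lo, hi);                     // same value as vld1q_f32(p)
    }
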
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
new file mode 100644
index 0000000000..4ca25eb5ba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "subs %w[k], %w[k], #1\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+ "ins %[a0].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+ "ins %[a1].d[1], x20\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "bne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+ "cbnz %w[oddk], 2f\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ // Detached final iteration. (even K)
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+ "ins %[a0a].d[1], x20\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+ "ins %[a1a].d[1], x20\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+ "ins %[b0].d[1], x20\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+ "ins %[b1].d[1], x20\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+
+ "ins %[b2].d[1], x20\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Detached final iteration. (odd K)
+ "2:\n"
+
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
new file mode 100644
index 0000000000..89fe6ac7ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k_iters = ((K + 1) / 2) - 1;
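+    // Worked example of this bookkeeping (editorial note): for K=5, oddk=1
+    // and k_iters=2, so the main loop below runs twice (covering 4 values
+    // of K) and the detached "odd K" tail handles the fifth; for K=6,
+    // oddk=0, k_iters=2 and the "even K" tail handles the last two.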
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ int k = k_iters;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
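+            // Editorial note: the "ldr %d[...]" / "ldr x20" / "ins ... .d[1]"
+            // pattern used throughout this kernel splits each 128-bit operand
+            // load into two 64-bit halves (one into the low half of the vector
+            // register, one via a general-purpose register), which is reported
+            // to dual-issue with the FMLAs better on Cortex-A55 (r1) than a
+            // plain 128-bit "ldr q" load.
+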
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #512]")
+
+ // The loop is offset by these two instructions which must
+ // always be executed.
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #448]")
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" ASM_PREFETCH("[%[b_ptr], #576]")
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Unroll 1
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "ldr %d[a0], [%[a_ptr], #64]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[a_ptr], #72]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %d[a1], [%[a_ptr], #80]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "ins %[a0].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[a_ptr], #88]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #96]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "ins %[a1].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "ldr x20, [%[b_ptr], #104]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #120]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCH("[%[b_ptr], #640]")
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "b.ne 1b\n"
+
+ // Branch here if K=1 or 2. Do the right thing for odd/even at the end.
+ "4:\n"
+
+ // Start final iteration - branch off to "odd" code before we load a0a.
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "cbnz %w[oddk], 2f\n"
+
+ // Even K continuation
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %d[a0a], [%[a_ptr], #32]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr x20, [%[a_ptr], #40]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %d[a1a], [%[a_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "ins %[a0a].d[1], x20\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "ldr x20, [%[a_ptr], #56]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "ldr %d[b0], [%[b_ptr], #48]\n"
+
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "ins %[a1a].d[1], x20\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #56]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "ldr %d[b1], [%[b_ptr], #64]\n"
+
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "ins %[b0].d[1], x20\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "ldr x20, [%[b_ptr], #72]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %d[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "ins %[b1].d[1], x20\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr x20, [%[b_ptr], #88]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "ins %[b2].d[1], x20\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]")
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]")
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #640]")
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "b 3f\n"
+
+ // Odd K continuation
+ "2:\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr]]")
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "ins %[b2].d[1], x20\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #64]")
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #128]")
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #192]")
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" ASM_PREFETCHW("[%[c_ptr], #256]")
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" ASM_PREFETCHW("[%[c_ptr], #320]")
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #384]")
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCHWL2("[%[c_ptr], #448]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" ASM_PREFETCHWL2("[%[c_ptr], #512]") "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" ASM_PREFETCHWL2("[%[c_ptr], #576]") "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ ASM_PREFETCHWL2("[%[c_ptr], #640]") "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" ASM_PREFETCHWL2("[%[c_ptr], #704]")
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+
+ // Common tail
+ "3:\n"
+ "str q8, [%[c_ptr]]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "str q10, [%[c_ptr], #96]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "str q11, [%[c_ptr], #144]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "str q12, [%[c_ptr], #192]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "str q13, [%[c_ptr], #240]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "str q14, [%[c_ptr], #288]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "str q15, [%[c_ptr], #336]\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
new file mode 100644
index 0000000000..42e870e814
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
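+//
+// As a rough scalar reference of what one such output block computes (an
+// editorial sketch only, not code used by the library), treating Cpanel as
+// zero-initialised (the kernel overwrites it rather than accumulating into
+// memory) and with the panel layouts described above:
+//
+//   for (int i = 0; i < 8; i++)        // row of the A block
+//     for (int j = 0; j < 12; j++)     // column of the B block
+//       for (int k = 0; k < K; k++)
+//         Cpanel[i * 12 + j] += Apanel[k * 8 + i] * Bpanel[k * 12 + j];
+//
+// The assembly below keeps the whole 12x8 accumulator in v8-v31 and steps
+// through K two iterations at a time.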
+
+namespace arm_gemm
+{
+void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K, long int row_jump = 0, long int block_jump = 0)
+{
+ const float *a_ptr = Apanel;
+ float *c_ptr = Cpanel;
+
+ for(int yb = 0; yb < ablocks; yb++)
+ {
+ const float *a_ptr0 = a_ptr;
+ const float *b_ptr = Bpanel;
+
+ for(int xb = 0; xb < bblocks; xb++)
+ {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K + 1) / 2) - 1;
+
+ register float32x4_t a0 asm("v0");
+ register float32x4_t a1 asm("v1");
+ register float32x4_t b0 asm("v2");
+ register float32x4_t b1 asm("v3");
+ register float32x4_t b2 asm("v4");
+ register float32x4_t a0a asm("v5");
+ register float32x4_t a1a asm("v6");
+
+ __asm __volatile(
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #64]") "movi v13.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #64]") "movi v14.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #128]") "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]") "movi v16.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #192]") "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #256]") "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]") "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[b_ptr], #320]") "movi v20.4s, #0x0\n" ASM_PREFETCH("[%[a_ptr], #256]") "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ // Loop proper
+ "1:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[a_ptr], #320]")
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" ASM_PREFETCH("[%[b_ptr], #448]")
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" ASM_PREFETCH("[%[b_ptr], #512]")
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[block_jump]\n"
+ "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "fmla v8.4s , %[b0].4s, %[a0].s[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "fmla v16.4s, %[b1].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "fmla v9.4s , %[b0].4s, %[a0].s[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "fmla v17.4s, %[b1].4s, %[a0].s[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "fmla v24.4s, %[b2].4s, %[a0].s[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "fmla v25.4s, %[b2].4s, %[a0].s[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "fmla v10.4s, %[b0].4s, %[a0].s[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "fmla v18.4s, %[b1].4s, %[a0].s[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "fmla v26.4s, %[b2].4s, %[a0].s[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "fmla v11.4s, %[b0].4s, %[a0].s[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "fmla v19.4s, %[b1].4s, %[a0].s[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "fmla v27.4s, %[b2].4s, %[a0].s[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "fmla v12.4s, %[b0].4s, %[a1].s[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "fmla v20.4s, %[b1].4s, %[a1].s[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "fmla v28.4s, %[b2].4s, %[a1].s[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "fmla v13.4s, %[b0].4s, %[a1].s[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "fmla v21.4s, %[b1].4s, %[a1].s[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "fmla v29.4s, %[b2].4s, %[a1].s[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "fmla v14.4s, %[b0].4s, %[a1].s[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "fmla v22.4s, %[b1].4s, %[a1].s[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "fmla v30.4s, %[b2].4s, %[a1].s[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "fmla v15.4s, %[b0].4s, %[a1].s[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "fmla v23.4s, %[b1].4s, %[a1].s[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "fmla v31.4s, %[b2].4s, %[a1].s[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+ :
+ [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr] "+r"(c_ptr),
+ [a0] "+w"(a0), [a1] "+w"(a1), [a0a] "+w"(a0a), [a1a] "+w"(a1a),
+ [b0] "+w"(b0), [b1] "+w"(b1), [b2] "+w"(b2), [k] "+r"(k)
+ : [oddk] "r"(oddk), [row_jump] "r"(row_jump), [block_jump] "r"(block_jump)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+ }
+}
+
+void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K)
+{
+ a64_sgemm_asimd_12x8_jumps(Apanel, Bpanel, Cpanel, ablocks, bblocks, K, 0, 0);
+}
+
+} // namespace arm_gemm
+
+#endif
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
new file mode 100644
index 0000000000..eceacc9031
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4.hpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemm_native_16x4(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+// 16x4 native SGEMM "strategy" class.
+//
+// This describes the characteristics of a family of kernels, in terms of
+// the required interleave properties and the output block size.
+//
+// All kernels in the family must share these characteristics. The actual
+// kernel to be used can be chosen at runtime, based on the CPUInfo
+// structure.
+class sgemm_native_16x4
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, int, float *, int, float, int, int, int);
+
+ /* Kernel blocking parameters */
+ static const int out_width = 16;
+ static const int out_height = 4;
+ static const int k_unroll = 1;
+
+ // Default to the generic kernel
+ kern_type kernel = a64_sgemm_native_16x4;
+
+ sgemm_native_16x4(const CPUInfo *ci)
+ {
+ }
+};
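+// Editorial sketch (not part of the original header): code that consumes a
+// strategy like this would typically instantiate it and call through the
+// kernel pointer, tiling the output into out_width x out_height (16x4)
+// blocks; the surrounding wrapper is not shown here and any names beyond
+// the class itself are illustrative only.
+//
+//   sgemm_native_16x4 strat(ci);   // ci: const CPUInfo *
+//   strat.kernel(a_ptr, lda, b_ptr, ldb, c_ptr, ldc, beta, M, N, K);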
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
new file mode 100644
index 0000000000..1b5787ce7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_native_16x4/generic.cpp
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+namespace arm_gemm
+{
+void a64_sgemm_native_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, float beta, int M, int N, int K)
+{
+ int oddk = (K % 8) ? 1 : 0;
+ int beta0 = (beta == 0.0f) ? 1 : 0;
+
+ /* For now, very naive with no blocking */
+ for(int y = 0; y < M; y += 4)
+ {
+ for(int x0 = 0; x0 < N; x0 += 16)
+ {
+ const float *a_ptr0 = A + (y * lda);
+ const float *a_ptr1 = a_ptr0 + lda;
+ const float *a_ptr2 = a_ptr1 + lda;
+ const float *a_ptr3 = a_ptr2 + lda;
+
+ const float *b_ptr = B + x0;
+
+ float *c_ptr0 = C + (y * ldc) + x0;
+ float *c_ptr1 = c_ptr0 + ldc;
+ float *c_ptr2 = c_ptr1 + ldc;
+ float *c_ptr3 = c_ptr2 + ldc;
+
+ int loops = ((K + 4) / 8) - 1;
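+            // Editorial note: with the body unrolled 8 deep in K, this means
+            // e.g. K=16 gives loops=1 (one pass of the main loop plus a full
+            // 8-deep detached final iteration), while K=20 gives loops=2 with
+            // oddk set, so the detached final iteration stops after its first
+            // four unrolls. K not being a multiple of 4 is assumed not to occur.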
+
+ size_t ldbb = ldb * sizeof(float);
+
+ __asm __volatile(
+ "a0 .req v0\n"
+ "a1 .req v1\n"
+ "a2 .req v2\n"
+ "a3 .req v3\n"
+ "a0a .req v4\n"
+ "a1a .req v5\n"
+ "a2a .req v6\n"
+ "a3a .req v7\n"
+ "bb0 .req v8\n"
+ "bb1 .req v9\n"
+ "bb2 .req v10\n"
+ "bb3 .req v11\n"
+ "b0a .req v12\n"
+ "b1a .req v13\n"
+ "b2a .req v14\n"
+ "b3a .req v15\n"
+
+ "a0q .req q0\n"
+ "a1q .req q1\n"
+ "a2q .req q2\n"
+ "a3q .req q3\n"
+ "a0aq .req q4\n"
+ "a1aq .req q5\n"
+ "a2aq .req q6\n"
+ "a3aq .req q7\n"
+ "b0q .req q8\n"
+ "b1q .req q9\n"
+ "b2q .req q10\n"
+ "b3q .req q11\n"
+ "b0aq .req q12\n"
+ "b1aq .req q13\n"
+ "b2aq .req q14\n"
+ "b3aq .req q15\n"
+
+ "movi v16.4s, #0x0\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "movi v17.4s, #0x0\n"
+ "ldr b0q, [%[b_ptr]]\n"
+ "movi v18.4s, #0x0\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+ "movi v19.4s, #0x0\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+ "movi v20.4s, #0x0\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+ "movi v21.4s, #0x0\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "movi v22.4s, #0x0\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "movi v23.4s, #0x0\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "movi v24.4s, #0x0\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+ "movi v25.4s, #0x0\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+ "movi v26.4s, #0x0\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+ "cbz %w[beta0], 5f\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip if no complete loops.
+ "cbz %w[loops], 4f\n"
+ "b 1f\n"
+
+ // If beta is non-zero, need to load and multiply by beta
+ "5:\n"
+ "ld1r {v4.4s}, [%[betaptr]]\n"
+ "ldr q16, [%[c_ptr0]]\n"
+ "ldr q17, [%[c_ptr0], #16]\n"
+ "ldr q18, [%[c_ptr0], #32]\n"
+ "ldr q19, [%[c_ptr0], #48]\n"
+
+ "ldr q20, [%[c_ptr1]]\n"
+ "fmul v16.4s, v16.4s, v4.4s\n"
+ "ldr q21, [%[c_ptr1], #16]\n"
+ "fmul v17.4s, v17.4s, v4.4s\n"
+ "ldr q22, [%[c_ptr1], #32]\n"
+ "fmul v18.4s, v18.4s, v4.4s\n"
+ "ldr q23, [%[c_ptr1], #48]\n"
+ "fmul v19.4s, v19.4s, v4.4s\n"
+
+ "ldr q24, [%[c_ptr2]]\n"
+ "fmul v20.4s, v20.4s, v4.4s\n"
+ "ldr q25, [%[c_ptr2], #16]\n"
+ "fmul v21.4s, v21.4s, v4.4s\n"
+ "ldr q26, [%[c_ptr2], #32]\n"
+ "fmul v22.4s, v22.4s, v4.4s\n"
+ "ldr q27, [%[c_ptr2], #48]\n"
+ "fmul v23.4s, v23.4s, v4.4s\n"
+
+ "ldr q28, [%[c_ptr3]]\n"
+ "fmul v24.4s, v24.4s, v4.4s\n"
+ "ldr q29, [%[c_ptr3], #16]\n"
+ "fmul v25.4s, v25.4s, v4.4s\n"
+ "ldr q30, [%[c_ptr3], #32]\n"
+ "fmul v26.4s, v26.4s, v4.4s\n"
+ "ldr q31, [%[c_ptr3], #48]\n"
+ "fmul v27.4s, v27.4s, v4.4s\n"
+
+ "fmul v28.4s, v28.4s, v4.4s\n"
+ "fmul v29.4s, v29.4s, v4.4s\n"
+ "fmul v30.4s, v30.4s, v4.4s\n"
+ "fmul v31.4s, v31.4s, v4.4s\n"
+
+ "cbz %w[loops], 4f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "subs %w[loops], %w[loops], #1\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "add %[a_ptr0], %[a_ptr0], #32\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "add %[a_ptr1], %[a_ptr1], #32\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "add %[a_ptr2], %[a_ptr2], #32\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "add %[a_ptr3], %[a_ptr3], #32\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "ldr a0q, [%[a_ptr0]]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "ldr a1q, [%[a_ptr1]]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "ldr a2q, [%[a_ptr2]]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "ldr a3q, [%[a_ptr3]]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "bne 1b\n"
+
+ // Skip to here
+ "4:\n"
+
+ // Detached final iteration
+ // Unroll 0
+ "fmla v16.4s, bb0.4s, a0.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1.s[0]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+ "fmla v24.4s, bb0.4s, a2.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v28.4s, bb0.4s, a3.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[0]\n"
+ "cbnz %w[oddk], 2f\n" // Deal with odd K before we load a0a
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "ldr a0aq, [%[a_ptr0], #16]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "ldr a1aq, [%[a_ptr1], #16]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "ldr a2aq, [%[a_ptr2], #16]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "subs %w[loops], %w[loops], #1\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "ldr a3aq, [%[a_ptr3], #16]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 4
+ "fmla v16.4s, bb0.4s, a0a.s[0]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[0]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[0]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[0]\n"
+ "ldr b0q, [%[b_ptr]]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[0]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[0]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[0]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 5
+ "fmla v16.4s, b0a.4s, a0a.s[1]\n"
+ "fmla v20.4s, b0a.4s, a1a.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v24.4s, b0a.4s, a2a.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3a.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0a.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[1]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0a.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0a.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 6
+ "fmla v16.4s, bb0.4s, a0a.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1a.s[2]\n"
+ "fmla v24.4s, bb0.4s, a2a.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3a.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0a.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1a.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2a.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3a.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0a.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1a.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2a.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3a.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0a.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1a.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2a.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3a.s[2]\n"
+
+ // Unroll 7
+ "fmla v16.4s, b0a.4s, a0a.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0a.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0a.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0a.s[3]\n"
+
+ "fmla v20.4s, b0a.4s, a1a.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1a.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1a.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1a.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2a.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2a.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2a.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2a.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3a.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3a.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3a.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3a.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+ "b 3f\n"
+
+ // Odd K case: Just do 4 more.
+ "2:\n"
+ "fmla v21.4s, bb1.4s, a1.s[0]\n"
+ "fmla v25.4s, bb1.4s, a2.s[0]\n"
+ "fmla v29.4s, bb1.4s, a3.s[0]\n"
+ "ldr b1q, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[0]\n"
+ "fmla v22.4s, bb2.4s, a1.s[0]\n"
+ "fmla v26.4s, bb2.4s, a2.s[0]\n"
+ "fmla v30.4s, bb2.4s, a3.s[0]\n"
+ "ldr b2q, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[0]\n"
+ "fmla v23.4s, bb3.4s, a1.s[0]\n"
+ "fmla v27.4s, bb3.4s, a2.s[0]\n"
+ "fmla v31.4s, bb3.4s, a3.s[0]\n"
+ "ldr b3q, [%[b_ptr], #48]\n"
+
+ // Unroll 1
+ "fmla v16.4s, b0a.4s, a0.s[1]\n"
+ "add %[b_ptr], %[b_ptr], %[ldb]\n"
+ "fmla v20.4s, b0a.4s, a1.s[1]\n"
+ "fmla v24.4s, b0a.4s, a2.s[1]\n"
+ "fmla v28.4s, b0a.4s, a3.s[1]\n"
+ "ldr b0aq, [%[b_ptr]]\n"
+
+ "fmla v17.4s, b1a.4s, a0.s[1]\n"
+ "fmla v21.4s, b1a.4s, a1.s[1]\n"
+ "subs %w[loops], %w[loops], #1\n"
+ "fmla v25.4s, b1a.4s, a2.s[1]\n"
+ "fmla v29.4s, b1a.4s, a3.s[1]\n"
+ "ldr b1aq, [%[b_ptr], #16]\n"
+
+ "fmla v18.4s, b2a.4s, a0.s[1]\n"
+ "fmla v22.4s, b2a.4s, a1.s[1]\n"
+ "fmla v26.4s, b2a.4s, a2.s[1]\n"
+ "fmla v30.4s, b2a.4s, a3.s[1]\n"
+ "ldr b2aq, [%[b_ptr], #32]\n"
+
+ "fmla v19.4s, b3a.4s, a0.s[1]\n"
+ "fmla v23.4s, b3a.4s, a1.s[1]\n"
+ "fmla v27.4s, b3a.4s, a2.s[1]\n"
+ "fmla v31.4s, b3a.4s, a3.s[1]\n"
+ "ldr b3aq, [%[b_ptr], #48]\n"
+
+ // Unroll 2
+ "fmla v16.4s, bb0.4s, a0.s[2]\n"
+ "fmla v20.4s, bb0.4s, a1.s[2]\n"
+ "fmla v24.4s, bb0.4s, a2.s[2]\n"
+ "fmla v28.4s, bb0.4s, a3.s[2]\n"
+
+ "fmla v17.4s, bb1.4s, a0.s[2]\n"
+ "fmla v21.4s, bb1.4s, a1.s[2]\n"
+ "fmla v25.4s, bb1.4s, a2.s[2]\n"
+ "fmla v29.4s, bb1.4s, a3.s[2]\n"
+
+ "fmla v18.4s, bb2.4s, a0.s[2]\n"
+ "fmla v22.4s, bb2.4s, a1.s[2]\n"
+ "fmla v26.4s, bb2.4s, a2.s[2]\n"
+ "fmla v30.4s, bb2.4s, a3.s[2]\n"
+
+ "fmla v19.4s, bb3.4s, a0.s[2]\n"
+ "fmla v23.4s, bb3.4s, a1.s[2]\n"
+ "fmla v27.4s, bb3.4s, a2.s[2]\n"
+ "fmla v31.4s, bb3.4s, a3.s[2]\n"
+
+ // Unroll 3
+ "fmla v16.4s, b0a.4s, a0.s[3]\n"
+ "fmla v17.4s, b1a.4s, a0.s[3]\n"
+ "fmla v18.4s, b2a.4s, a0.s[3]\n"
+ "fmla v19.4s, b3a.4s, a0.s[3]\n"
+
+ "fmla v20.4s, b0a.4s, a1.s[3]\n"
+ "str q16, [%[c_ptr0]]\n"
+ "fmla v21.4s, b1a.4s, a1.s[3]\n"
+ "str q17, [%[c_ptr0], #16]\n"
+ "fmla v22.4s, b2a.4s, a1.s[3]\n"
+ "str q18, [%[c_ptr0], #32]\n"
+ "fmla v23.4s, b3a.4s, a1.s[3]\n"
+ "str q19, [%[c_ptr0], #48]\n"
+
+ "fmla v24.4s, b0a.4s, a2.s[3]\n"
+ "str q20, [%[c_ptr1]]\n"
+ "fmla v25.4s, b1a.4s, a2.s[3]\n"
+ "str q21, [%[c_ptr1], #16]\n"
+ "fmla v26.4s, b2a.4s, a2.s[3]\n"
+ "str q22, [%[c_ptr1], #32]\n"
+ "fmla v27.4s, b3a.4s, a2.s[3]\n"
+ "str q23, [%[c_ptr1], #48]\n"
+
+ "fmla v28.4s, b0a.4s, a3.s[3]\n"
+ "str q24, [%[c_ptr2]]\n"
+ "fmla v29.4s, b1a.4s, a3.s[3]\n"
+ "str q25, [%[c_ptr2], #16]\n"
+ "fmla v30.4s, b2a.4s, a3.s[3]\n"
+ "str q26, [%[c_ptr2], #32]\n"
+ "fmla v31.4s, b3a.4s, a3.s[3]\n"
+ "str q27, [%[c_ptr2], #48]\n"
+
+ "3:\n"
+ "str q28, [%[c_ptr3]]\n"
+ "str q29, [%[c_ptr3], #16]\n"
+ "str q30, [%[c_ptr3], #32]\n"
+ "str q31, [%[c_ptr3], #48]\n"
+
+ : [a_ptr0] "+r"(a_ptr0), [a_ptr1] "+r"(a_ptr1), [a_ptr2] "+r"(a_ptr2), [a_ptr3] "+r"(a_ptr3),
+ [b_ptr] "+r"(b_ptr), [loops] "+r"(loops)
+ : [ldb] "r"(ldbb), [oddk] "r"(oddk), [beta0] "r"(beta0), [betaptr] "r"(&beta),
+ [c_ptr0] "r"(c_ptr0), [c_ptr1] "r"(c_ptr1), [c_ptr2] "r"(c_ptr2), [c_ptr3] "r"(c_ptr3)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+ "cc", "memory");
+ }
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
new file mode 100644
index 0000000000..c89514f98e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+namespace arm_gemm
+{
+// Actual kernel implementations
+void a64_sgemv_pretransposed(const float *, int, const float *, float *, float, int, int);
+
+// Pretransposed SGEMV strategy class.
+class sgemv_pretransposed
+{
+public:
+ typedef float operand_type;
+ typedef float result_type;
+
+ typedef void (*kern_type)(const float *, int, const float *, float *, float, int, int);
+
+ /* Describes the data layout for matrix (A) input */
+
+ /* Note that often GEMV is expressed as a GEMM with M=1, i.e. A is the
+ * (row) vector and B is the matrix, but the standard GEMV arrangement
+ * is matrix A times (column) vector X. "A_transpose" is expressed in
+ * terms of this standard arrangement, so if the A matrix is in fact the
+ * B matrix from a GEMM call, the sense of the transpose needs to be
+ * reversed. */
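+    /* Editorial example: if the caller's problem is really a GEMM with M=1
+     * (a row vector times a matrix it calls "B"), that "B" is the "A" of
+     * this GEMV, so the A_transpose flag below describes the opposite sense
+     * to the transpose the caller would apply in its own GEMM terms. */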
+ static const int A_interleave = 32;
+ static const int A_block = 1;
+ static const bool A_transpose = false;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 32;
+ static const int k_unroll = 1;
+
+ kern_type kernel = a64_sgemv_pretransposed;
+
+ sgemv_pretransposed(const CPUInfo *ci)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
new file mode 100644
index 0000000000..290759822a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm
+{
+void a64_sgemv_pretransposed(const float *A, int lda, const float *X, float *Y, float beta, int M, int N)
+{
+ const bool beta0 = (beta == 0.0f);
+ const bool beta1 = (beta == 1.0f);
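+    // Editorial note: the code below computes Y = beta * Y + (A panel) . X,
+    // 32 outputs of Y at a time across the M entries of X, with the beta==0
+    // and beta==1 cases special-cased so that Y is never read (respectively
+    // never scaled) when it does not need to be.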
+
+ for(int x = 0; x < N; x += 32)
+ {
+ float *y_ptr = Y + x;
+
+ // How many elements are we processing in this loop?
+ int l = std::min(N - x, 32);
+
+ register float32x4_t r0 asm("v24");
+ register float32x4_t r1 asm("v25");
+ register float32x4_t r2 asm("v26");
+ register float32x4_t r3 asm("v27");
+ register float32x4_t r4 asm("v28");
+ register float32x4_t r5 asm("v29");
+ register float32x4_t r6 asm("v30");
+ register float32x4_t r7 asm("v31");
+
+ register float32x4_t x0 asm("v0");
+ register float32x4_t x0a asm("v1");
+
+ const float *x_ptr = X;
+ const float *a_ptr = A + ((x / 32) * lda);
+
+ if(beta0)
+ {
+ r0 = r1 = r2 = r3 = r4 = r5 = r6 = r7 = vdupq_n_f32(0.0f);
+ }
+ else
+ {
+ if(l == 32)
+ {
+ // Fastest path - load all 8 vectors
+ r0 = vld1q_f32(y_ptr);
+ r1 = vld1q_f32(y_ptr + 4);
+ r2 = vld1q_f32(y_ptr + 8);
+ r3 = vld1q_f32(y_ptr + 12);
+ r4 = vld1q_f32(y_ptr + 16);
+ r5 = vld1q_f32(y_ptr + 20);
+ r6 = vld1q_f32(y_ptr + 24);
+ r7 = vld1q_f32(y_ptr + 28);
+ }
+ else
+ {
+ // Slow case - leftovers. Note that we don't care about
+ // out-of-range vectors and lanes as we will throw them away at
+ // the end.
+ int vecs = l / 4; // How many leftover vectors?
+ int oddbits = l % 4; // And how many odd single values?
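+                // Editorial example: l == 13 gives vecs == 3 and oddbits == 1,
+                // so r0-r2 are loaded whole below and the single leftover value
+                // lands (via oddvec) in r3; anything beyond that is simply not
+                // touched and, as noted above, gets thrown away at the end.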
+
+ if(oddbits)
+ {
+ // Load the outstanding odd values into a vector first
+ float32x4_t oddvec = vdupq_n_f32(0.0f); // This does not really need to be initialized, but the compiler has a hard time with that.
+ float *oddbase = y_ptr + l - oddbits;
+
+ switch(oddbits)
+ {
+ case 3:
+ oddvec = vld1q_lane_f32(oddbase + 2, oddvec, 2);
+ // fall through
+ case 2:
+ oddvec = vld1q_lane_f32(oddbase + 1, oddvec, 1);
+ // fall through
+ case 1:
+ oddvec = vld1q_lane_f32(oddbase, oddvec, 0);
+ break;
+
+ default:
+ UNREACHABLE("Impossible case in switch.");
+ }
+
+ // Now load the whole vectors, putting the oddments in when we run out.
+ do
+ {
+ if(vecs == 0)
+ {
+ r0 = oddvec;
+ break;
+ }
+
+ r0 = vld1q_f32(y_ptr);
+ if(--vecs == 0)
+ {
+ r1 = oddvec;
+ break;
+ }
+
+ r1 = vld1q_f32(y_ptr + 4);
+ if(--vecs == 0)
+ {
+ r2 = oddvec;
+ break;
+ }
+
+ r2 = vld1q_f32(y_ptr + 8);
+ if(--vecs == 0)
+ {
+ r3 = oddvec;
+ break;
+ }
+
+ r3 = vld1q_f32(y_ptr + 12);
+ if(--vecs == 0)
+ {
+ r4 = oddvec;
+ break;
+ }
+
+ r4 = vld1q_f32(y_ptr + 16);
+ if(--vecs == 0)
+ {
+ r5 = oddvec;
+ break;
+ }
+
+ r5 = vld1q_f32(y_ptr + 20);
+ if(--vecs == 0)
+ {
+ r6 = oddvec;
+ break;
+ }
+
+ r6 = vld1q_f32(y_ptr + 24);
+ r7 = oddvec;
+ }
+ while(0);
+ }
+ else
+ {
+ // Slightly less slow path - just load the whole vectors
+ do
+ {
+ // It can't be the case that oddbits==0 AND vecs==0 or we wouldn't be here.
+ if(vecs == 0)
+ {
+ UNREACHABLE("Impossible lack of work to do");
+ }
+
+ r0 = vld1q_f32(y_ptr);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r1 = vld1q_f32(y_ptr + 4);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r2 = vld1q_f32(y_ptr + 8);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r3 = vld1q_f32(y_ptr + 12);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r4 = vld1q_f32(y_ptr + 16);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r5 = vld1q_f32(y_ptr + 20);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ r6 = vld1q_f32(y_ptr + 24);
+ }
+ while(0);
+ }
+ }
+
+ if(!beta1)
+ {
+ const float32x4_t vb = vdupq_n_f32(beta);
+
+ r0 = vmulq_f32(r0, vb);
+ r1 = vmulq_f32(r1, vb);
+ r2 = vmulq_f32(r2, vb);
+ r3 = vmulq_f32(r3, vb);
+ r4 = vmulq_f32(r4, vb);
+ r5 = vmulq_f32(r5, vb);
+ r6 = vmulq_f32(r6, vb);
+ r7 = vmulq_f32(r7, vb);
+ }
+ }
+
+ if(M >= 8)
+ {
+ int k = (M / 8) - 1;
+ x0 = vld1q_f32(x_ptr);
+
+ __asm __volatile(
+ "ldr q2, [%[a_ptr], #0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr q10, [%[a_ptr], #128]\n"
+ "ldr q11, [%[a_ptr], #144]\n"
+ "ldr q12, [%[a_ptr], #160]\n"
+ "ldr q13, [%[a_ptr], #176]\n"
+ "ldr q14, [%[a_ptr], #192]\n"
+ "ldr q15, [%[a_ptr], #208]\n"
+ "ldr q16, [%[a_ptr], #224]\n"
+ "ldr q17, [%[a_ptr], #240]\n"
+ "ldr q18, [%[a_ptr], #256]\n"
+ "ldr q19, [%[a_ptr], #272]\n"
+ "ldr q20, [%[a_ptr], #288]\n"
+ "ldr q21, [%[a_ptr], #304]\n"
+ "ldr q22, [%[a_ptr], #320]\n"
+ "ldr q23, [%[a_ptr], #336]\n" ASM_PREFETCH("[%[a_ptr], #384]")
+ ASM_PREFETCH("[%[a_ptr], #448]")
+ ASM_PREFETCH("[%[a_ptr], #512]")
+ ASM_PREFETCH("[%[a_ptr], #576]")
+ ASM_PREFETCH("[%[a_ptr], #640]")
+ ASM_PREFETCH("[%[a_ptr], #704]")
+ ASM_PREFETCH("[%[a_ptr], #768]")
+ ASM_PREFETCH("[%[a_ptr], #832]")
+ ASM_PREFETCH("[%[a_ptr], #896]")
+ ASM_PREFETCH("[%[a_ptr], #960]")
+ ASM_PREFETCH("[%[a_ptr], #1024]")
+ ASM_PREFETCH("[%[a_ptr], #1088]")
+ ASM_PREFETCH("[%[a_ptr], #1152]")
+ ASM_PREFETCH("[%[a_ptr], #1216]")
+ ASM_PREFETCH("[%[a_ptr], #1280]")
+ ASM_PREFETCH("[%[a_ptr], #1344]")
+ ASM_PREFETCH("[%[a_ptr], #1408]")
+ ASM_PREFETCH("[%[a_ptr], #1472]")
+ ASM_PREFETCH("[%[a_ptr], #1536]")
+ ASM_PREFETCH("[%[a_ptr], #1600]")
+ ASM_PREFETCH("[%[a_ptr], #1664]")
+ ASM_PREFETCH("[%[a_ptr], #1728]")
+ ASM_PREFETCH("[%[a_ptr], #1792]")
+ ASM_PREFETCH("[%[a_ptr], #1856]")
+ ASM_PREFETCH("[%[a_ptr], #1920]")
+ ASM_PREFETCH("[%[a_ptr], #1984]")
+ "add %[a_ptr], %[a_ptr], #352\n"
+
+ "cbz %w[k], 2f\n"
+
+ "1:\n"
+ // Unroll 0
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr %q[x0a], [%[x_ptr], #16]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #0]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #16]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #32]\n"
+ "add %[x_ptr], %[x_ptr], #32\n" ASM_PREFETCH("[%[a_ptr], #1664]")
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #48]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #64]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #80]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #96]\n" ASM_PREFETCH("[%[a_ptr], #1728]")
+
+ // Unroll 1
+ "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
+ "ldr q10, [%[a_ptr], #112]\n"
+ "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
+ "ldr q11, [%[a_ptr], #128]\n"
+ "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
+ "ldr q12, [%[a_ptr], #144]\n"
+ "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
+ "ldr q13, [%[a_ptr], #160]\n" ASM_PREFETCH("[%[a_ptr], #1792]")
+ "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
+ "ldr q14, [%[a_ptr], #176]\n"
+ "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
+ "ldr q15, [%[a_ptr], #192]\n"
+ "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
+ "ldr q16, [%[a_ptr], #208]\n"
+ "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
+ "ldr q17, [%[a_ptr], #224]\n" ASM_PREFETCH("[%[a_ptr], #1856]")
+
+ // Unroll 2
+ "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
+ "ldr q18, [%[a_ptr], #240]\n"
+ "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
+ "ldr q19, [%[a_ptr], #256]\n"
+ "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
+ "ldr q20, [%[a_ptr], #272]\n"
+ "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
+ "ldr q21, [%[a_ptr], #288]\n" ASM_PREFETCH("[%[a_ptr], #1920]")
+ "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
+ "ldr q22, [%[a_ptr], #304]\n"
+ "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
+ "ldr q23, [%[a_ptr], #320]\n"
+ "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
+ "ldr q2, [%[a_ptr], #336]\n"
+ "ldr q3, [%[a_ptr], #352]\n"
+ "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
+ "ldr q4, [%[a_ptr], #368]\n" ASM_PREFETCH("[%[a_ptr], #1984]")
+
+ // Unroll 3
+ "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
+ "ldr q5, [%[a_ptr], #384]\n"
+ "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
+ "ldr q6, [%[a_ptr], #400]\n"
+ "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
+ "ldr q7, [%[a_ptr], #416]\n"
+ "fmla %[r3].4s, v8.4s, %[x0].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2048]")
+ "ldr q8, [%[a_ptr], #432]\n"
+ "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
+ "ldr q9, [%[a_ptr], #448]\n"
+ "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
+ "ldr q10, [%[a_ptr], #464]\n"
+ "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
+ "ldr q11, [%[a_ptr], #480]\n"
+ "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
+ "ldr q12, [%[a_ptr], #496]\n" ASM_PREFETCH("[%[a_ptr], #2112]")
+
+ // Unroll 4
+ "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
+ "ldr %q[x0], [%[x_ptr]]\n"
+ "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
+ "ldr q14, [%[a_ptr], #512]\n"
+ "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
+ "ldr q15, [%[a_ptr], #528]\n"
+ "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n" ASM_PREFETCH("[%[a_ptr], #2176]")
+ "ldr q16, [%[a_ptr], #544]\n"
+ "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
+ "ldr q17, [%[a_ptr], #560]\n"
+ "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
+ "ldr q18, [%[a_ptr], #576]\n"
+ "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
+ "ldr q19, [%[a_ptr], #592]\n"
+ "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
+ "ldr q20, [%[a_ptr], #608]\n" ASM_PREFETCH("[%[a_ptr], #2240]")
+
+ // Unroll 5
+ "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
+ "ldr q21, [%[a_ptr], #624]\n"
+ "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
+ "ldr q22, [%[a_ptr], #640]\n"
+ "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
+ "ldr q23, [%[a_ptr], #656]\n"
+ "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
+ "ldr q2, [%[a_ptr], #672]\n" ASM_PREFETCH("[%[a_ptr], #2304]")
+ "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
+ "ldr q3, [%[a_ptr], #688]\n"
+ "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
+ "ldr q4, [%[a_ptr], #704]\n"
+ "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
+ "ldr q5, [%[a_ptr], #720]\n"
+ "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
+ "ldr q6, [%[a_ptr], #736]\n" ASM_PREFETCH("[%[a_ptr], #2368]")
+
+ // Unroll 6
+ "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
+ "ldr q7, [%[a_ptr], #752]\n"
+ "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
+ "ldr q8, [%[a_ptr], #768]\n"
+ "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
+ "ldr q9, [%[a_ptr], #784]\n"
+ "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
+ "ldr q10, [%[a_ptr], #800]\n" ASM_PREFETCH("[%[a_ptr], #2432]")
+ "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
+ "ldr q11, [%[a_ptr], #816]\n"
+ "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
+ "ldr q12, [%[a_ptr], #832]\n"
+ "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
+ "ldr q13, [%[a_ptr], #848]\n"
+ "ldr q14, [%[a_ptr], #864]\n"
+ "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
+ "ldr q15, [%[a_ptr], #880]\n" ASM_PREFETCH("[%[a_ptr], #2496]")
+
+ // Unroll 7
+ "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
+ "ldr q16, [%[a_ptr], #896]\n"
+ "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
+ "ldr q17, [%[a_ptr], #912]\n"
+ "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
+ "ldr q18, [%[a_ptr], #928]\n"
+ "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n" ASM_PREFETCH("[%[a_ptr], #2560]")
+ "ldr q19, [%[a_ptr], #944]\n"
+ "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
+ "ldr q20, [%[a_ptr], #960]\n"
+ "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
+ "ldr q21, [%[a_ptr], #976]\n"
+ "add %[a_ptr], %[a_ptr], #1024\n"
+ "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
+ "ldr q22, [%[a_ptr], #-32]\n"
+ "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
+ "ldr q23, [%[a_ptr], #-16]\n" ASM_PREFETCH("[%[a_ptr], #1600]")
+ "bne 1b\n"
+
+ // Detached final iteration
+ "2:\n"
+
+ // Unroll 0
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr %q[x0a], [%[x_ptr], #16]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #0]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #16]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #32]\n"
+ "add %[x_ptr], %[x_ptr], #32\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #48]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #64]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #80]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #96]\n"
+
+ // Unroll 1
+ "fmla %[r0].4s, v10.4s, %[x0].s[1]\n"
+ "ldr q10, [%[a_ptr], #112]\n"
+ "fmla %[r1].4s, v11.4s, %[x0].s[1]\n"
+ "ldr q11, [%[a_ptr], #128]\n"
+ "fmla %[r2].4s, v12.4s, %[x0].s[1]\n"
+ "ldr q12, [%[a_ptr], #144]\n"
+ "fmla %[r3].4s, v13.4s, %[x0].s[1]\n"
+ "ldr q13, [%[a_ptr], #160]\n"
+ "fmla %[r4].4s, v14.4s, %[x0].s[1]\n"
+ "ldr q14, [%[a_ptr], #176]\n"
+ "fmla %[r5].4s, v15.4s, %[x0].s[1]\n"
+ "ldr q15, [%[a_ptr], #192]\n"
+ "fmla %[r6].4s, v16.4s, %[x0].s[1]\n"
+ "ldr q16, [%[a_ptr], #208]\n"
+ "fmla %[r7].4s, v17.4s, %[x0].s[1]\n"
+ "ldr q17, [%[a_ptr], #224]\n"
+
+ // Unroll 2
+ "fmla %[r0].4s, v18.4s, %[x0].s[2]\n"
+ "ldr q18, [%[a_ptr], #240]\n"
+ "fmla %[r1].4s, v19.4s, %[x0].s[2]\n"
+ "ldr q19, [%[a_ptr], #256]\n"
+ "fmla %[r2].4s, v20.4s, %[x0].s[2]\n"
+ "ldr q20, [%[a_ptr], #272]\n"
+ "fmla %[r3].4s, v21.4s, %[x0].s[2]\n"
+ "ldr q21, [%[a_ptr], #288]\n"
+ "fmla %[r4].4s, v22.4s, %[x0].s[2]\n"
+ "ldr q22, [%[a_ptr], #304]\n"
+ "fmla %[r5].4s, v23.4s, %[x0].s[2]\n"
+ "ldr q23, [%[a_ptr], #320]\n"
+ "fmla %[r6].4s, v3.4s, %[x0].s[2]\n"
+ "ldr q2, [%[a_ptr], #336]\n"
+ "ldr q3, [%[a_ptr], #352]\n"
+ "fmla %[r7].4s, v4.4s, %[x0].s[2]\n"
+ "ldr q4, [%[a_ptr], #368]\n"
+
+ // Unroll 3
+ "fmla %[r0].4s, v5.4s, %[x0].s[3]\n"
+ "ldr q5, [%[a_ptr], #384]\n"
+ "fmla %[r1].4s, v6.4s, %[x0].s[3]\n"
+ "ldr q6, [%[a_ptr], #400]\n"
+ "fmla %[r2].4s, v7.4s, %[x0].s[3]\n"
+ "ldr q7, [%[a_ptr], #416]\n"
+ "fmla %[r3].4s, v8.4s, %[x0].s[3]\n"
+ "ldr q8, [%[a_ptr], #432]\n"
+ "fmla %[r4].4s, v9.4s, %[x0].s[3]\n"
+ "ldr q9, [%[a_ptr], #448]\n"
+ "fmla %[r5].4s, v10.4s, %[x0].s[3]\n"
+ "ldr q10, [%[a_ptr], #464]\n"
+ "fmla %[r6].4s, v11.4s, %[x0].s[3]\n"
+ "ldr q11, [%[a_ptr], #480]\n"
+ "fmla %[r7].4s, v12.4s, %[x0].s[3]\n"
+ "ldr q12, [%[a_ptr], #496]\n"
+
+ // Unroll 4
+ "fmla %[r0].4s, v13.4s, %[x0a].s[0]\n"
+ "fmla %[r1].4s, v14.4s, %[x0a].s[0]\n"
+ "ldr q14, [%[a_ptr], #512]\n"
+ "fmla %[r2].4s, v15.4s, %[x0a].s[0]\n"
+ "ldr q15, [%[a_ptr], #528]\n"
+ "fmla %[r3].4s, v16.4s, %[x0a].s[0]\n"
+ "ldr q16, [%[a_ptr], #544]\n"
+ "fmla %[r4].4s, v17.4s, %[x0a].s[0]\n"
+ "ldr q17, [%[a_ptr], #560]\n"
+ "fmla %[r5].4s, v18.4s, %[x0a].s[0]\n"
+ "ldr q18, [%[a_ptr], #576]\n"
+ "fmla %[r6].4s, v19.4s, %[x0a].s[0]\n"
+ "ldr q19, [%[a_ptr], #592]\n"
+ "fmla %[r7].4s, v20.4s, %[x0a].s[0]\n"
+ "ldr q20, [%[a_ptr], #608]\n"
+
+ // Unroll 5
+ "fmla %[r0].4s, v21.4s, %[x0a].s[1]\n"
+ "ldr q21, [%[a_ptr], #624]\n"
+ "fmla %[r1].4s, v22.4s, %[x0a].s[1]\n"
+ "ldr q22, [%[a_ptr], #640]\n"
+ "fmla %[r2].4s, v23.4s, %[x0a].s[1]\n"
+ "ldr q23, [%[a_ptr], #656]\n"
+ "fmla %[r3].4s, v2.4s, %[x0a].s[1]\n"
+ "add %[a_ptr], %[a_ptr], #672\n"
+ "fmla %[r4].4s, v3.4s, %[x0a].s[1]\n"
+ "fmla %[r5].4s, v4.4s, %[x0a].s[1]\n"
+ "fmla %[r6].4s, v5.4s, %[x0a].s[1]\n"
+ "fmla %[r7].4s, v6.4s, %[x0a].s[1]\n"
+
+ // Unroll 6
+ "fmla %[r0].4s, v7.4s, %[x0a].s[2]\n"
+ "fmla %[r1].4s, v8.4s, %[x0a].s[2]\n"
+ "fmla %[r2].4s, v9.4s, %[x0a].s[2]\n"
+ "fmla %[r3].4s, v10.4s, %[x0a].s[2]\n"
+ "fmla %[r4].4s, v11.4s, %[x0a].s[2]\n"
+ "fmla %[r5].4s, v12.4s, %[x0a].s[2]\n"
+ "fmla %[r6].4s, v14.4s, %[x0a].s[2]\n"
+ "fmla %[r7].4s, v15.4s, %[x0a].s[2]\n"
+
+ // Unroll 7
+ "fmla %[r0].4s, v16.4s, %[x0a].s[3]\n"
+ "fmla %[r1].4s, v17.4s, %[x0a].s[3]\n"
+ "fmla %[r2].4s, v18.4s, %[x0a].s[3]\n"
+ "fmla %[r3].4s, v19.4s, %[x0a].s[3]\n"
+ "fmla %[r4].4s, v20.4s, %[x0a].s[3]\n"
+ "fmla %[r5].4s, v21.4s, %[x0a].s[3]\n"
+ "fmla %[r6].4s, v22.4s, %[x0a].s[3]\n"
+ "fmla %[r7].4s, v23.4s, %[x0a].s[3]\n"
+ :
+ [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+ [x0] "+w"(x0), [x0a] "+w"(x0a), [k] "+r"(k),
+ [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+ [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+ :
+ : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14",
+ "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "x20", "x21", "cc", "memory");
+ }
+
+ // Deal with ragged M
+ if(M % 8)
+ {
+ int l = (M % 8) - 1;
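+ // The loop below handles the leftover rows one at a time; l counts the
+ // iterations before the detached final row.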
+
+ __asm __volatile(
+ "ldr q2, [%[a_ptr], #0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr %s[x0], [%[x_ptr]]\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+
+ "cbz %w[l], 2f\n"
+
+ "1:\n"
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "ldr q2, [%[a_ptr], #0]\n"
+ "subs %w[l], %w[l], #1\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "ldr q3, [%[a_ptr], #16]\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "ldr q4, [%[a_ptr], #32]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "ldr q5, [%[a_ptr], #48]\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "ldr q6, [%[a_ptr], #64]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "ldr q7, [%[a_ptr], #80]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "ldr q8, [%[a_ptr], #96]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ "ldr q9, [%[a_ptr], #112]\n"
+ "ldr %s[x0], [%[x_ptr]]\n"
+ "add %[a_ptr], %[a_ptr], #128\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+ "bne 1b\n"
+
+ "2:\n"
+
+ "fmla %[r0].4s, v2.4s, %[x0].s[0]\n"
+ "fmla %[r1].4s, v3.4s, %[x0].s[0]\n"
+ "fmla %[r2].4s, v4.4s, %[x0].s[0]\n"
+ "fmla %[r3].4s, v5.4s, %[x0].s[0]\n"
+ "fmla %[r4].4s, v6.4s, %[x0].s[0]\n"
+ "fmla %[r5].4s, v7.4s, %[x0].s[0]\n"
+ "fmla %[r6].4s, v8.4s, %[x0].s[0]\n"
+ "fmla %[r7].4s, v9.4s, %[x0].s[0]\n"
+ :
+ [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr),
+ [x0] "+w"(x0), [l] "+r"(l),
+ [r0] "+w"(r0), [r1] "+w"(r1), [r2] "+w"(r2), [r3] "+w"(r3),
+ [r4] "+w"(r4), [r5] "+w"(r5), [r6] "+w"(r6), [r7] "+w"(r7)
+ :
+ : "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "cc", "memory");
+ }
+
+ if(l == 32)
+ {
+ // Fast path
+ vst1q_f32(y_ptr, r0);
+ vst1q_f32(y_ptr + 4, r1);
+ vst1q_f32(y_ptr + 8, r2);
+ vst1q_f32(y_ptr + 12, r3);
+ vst1q_f32(y_ptr + 16, r4);
+ vst1q_f32(y_ptr + 20, r5);
+ vst1q_f32(y_ptr + 24, r6);
+ vst1q_f32(y_ptr + 28, r7);
+ }
+ else
+ {
+ int vecs = l / 4;
+ int oddbits = l % 4;
+
+ if(oddbits)
+ {
+ // As above - slowest path deals with vectors plus odd bits
+ float32x4_t oddvec;
+
+ do
+ {
+ if(vecs == 0)
+ {
+ oddvec = r0;
+ break;
+ }
+
+ vst1q_f32(y_ptr, r0);
+ if(--vecs == 0)
+ {
+ oddvec = r1;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 4, r1);
+ if(--vecs == 0)
+ {
+ oddvec = r2;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 8, r2);
+ if(--vecs == 0)
+ {
+ oddvec = r3;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 12, r3);
+ if(--vecs == 0)
+ {
+ oddvec = r4;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 16, r4);
+ if(--vecs == 0)
+ {
+ oddvec = r5;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 20, r5);
+ if(--vecs == 0)
+ {
+ oddvec = r6;
+ break;
+ }
+
+ vst1q_f32(y_ptr + 24, r6);
+ oddvec = r7;
+ }
+ while(0);
+
+ float *oddbase = y_ptr + l - oddbits;
+
+ switch(oddbits)
+ {
+ case 3:
+ vst1q_lane_f32(oddbase + 2, oddvec, 2);
+ // fall through
+ case 2:
+ vst1q_lane_f32(oddbase + 1, oddvec, 1);
+ // fall through
+ case 1:
+ vst1q_lane_f32(oddbase, oddvec, 0);
+ break;
+
+ default:
+ // oddbits must be 1, 2 or 3.
+ UNREACHABLE("Impossible case in switch.");
+ }
+ }
+ else
+ {
+ // As above - medium path deals with vectors only
+ do
+ {
+ if(vecs == 0)
+ {
+ UNREACHABLE("vecs and oddbits can't both be 0");
+ }
+
+ vst1q_f32(y_ptr, r0);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 4, r1);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 8, r2);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 12, r3);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 16, r4);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 20, r5);
+ if(--vecs == 0)
+ {
+ break;
+ }
+
+ vst1q_f32(y_ptr + 24, r6);
+ }
+ while(0);
+ }
+ }
+ }
+}
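+
+// For reference, an illustrative scalar sketch of what the kernel above
+// computes (a sketch only, not part of the optimized implementation). It
+// assumes the pretransposed layout implied by the pointer arithmetic: A is
+// stored as panels of 32 output columns, each panel holding its M rows
+// contiguously (32 floats per row), with consecutive panels lda floats apart.
+//
+//   for (int x = 0; x < N; x++) {
+//       const float *panel = A + (x / 32) * lda;
+//       float acc = (beta == 0.0f) ? 0.0f : beta * Y[x];
+//       for (int i = 0; i < M; i++) {
+//           acc += panel[i * 32 + (x % 32)] * X[i];
+//       }
+//       Y[x] = acc;
+//   }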
+
+} // namespace arm_gemm
+
+#endif // aarch64
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
index 2a39ca1f07..5b9bd72c89 100644
--- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
@@ -25,11 +25,14 @@
#ifdef __aarch64__
+namespace arm_gemm
+{
// Actual kernel implementations
-#include "generic.hpp"
+void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
// Transposed SGEMV strategy class.
-class sgemv_trans {
+class sgemv_trans
+{
public:
typedef float operand_type;
typedef float result_type;
@@ -37,14 +40,16 @@ public:
typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
/* Kernel blocking parameters */
- static const int out_width = 12;
- static const int k_unroll = 1;
+ static const int out_width = 96;
+ static const int k_unroll = 1;
- kern_type kernel;
+ kern_type kernel = a64_sgemv_trans;
- sgemv_trans(const CPUInfo *ci) {
- kernel = a64_sgemv_trans;
+ sgemv_trans(const CPUInfo *ci)
+ {
}
};
+} // namespace arm_gemm
+
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
new file mode 100644
index 0000000000..3309baff3a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <cstddef>
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+// Kernel implementation - transposed GEMV
+//
+// The kernel will process "M" rows of A (= steps of dot product) and "N"
+// columns (= dot products total)
+//
+// General plan is to do as many columns simultaneously as possible - a
+// reasonable limit is half the NEON regfile = 64 total accumulators.
+//
+// It's possible that messing around with sub-blocking M and N can yield
+// higher performance, but that's left to the outer loop. In this kernel we
+// process all of M at the same time.
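+//
+// For reference, an illustrative scalar version of the operation performed
+// here (a sketch only, not part of the optimized kernel; A is M x N,
+// row-major with leading dimension lda):
+//
+//   for (int j = 0; j < N; j++) {
+//       float acc = 0.0f;
+//       for (int i = 0; i < M; i++) {
+//           acc += A[i * lda + j] * X[i];   // walk down column j
+//       }
+//       Y[j] += alpha * acc;
+//   }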
+
+// How far ahead to prefetch for the first and subsequent prefetches.
+// These values work for A72 on JunoR2...
+
+#define FIRST_PFD 9
+#define PFD 6
+
+namespace arm_gemm
+{
+void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float alpha, int lda, int M, int N)
+{
+ const float *a_ptr_base = Astart;
+ float *y_ptr = Ystart;
+
+ register const float32x4_t va asm("v1") = vdupq_n_f32(alpha);
+
+ int firstpfd = FIRST_PFD;
+ if(firstpfd > M)
+ {
+ firstpfd = (M - 1);
+ }
+
+ int pfd = PFD;
+ if(pfd > M)
+ {
+ pfd = (M - 1);
+ }
+
+ ptrdiff_t jump = lda * sizeof(int);
+
+ for(; N >= 96; N -= 96)
+ {
+ int k = M - 1;
+
+ const float *a_ptr = a_ptr_base;
+ const float *x_ptr = Xstart;
+ const float *pf_ptr = a_ptr;
+ const float *firstpf_ptr = a_ptr;
+ const float *pf_limit = a_ptr + (M * lda);
+
+ for(int i = 0; i < firstpfd; i++)
+ {
+ prefetch_1x(firstpf_ptr);
+ firstpf_ptr += lda;
+ }
+
+ for(int i = 0; i < pfd; i++)
+ {
+ prefetch_5x(pf_ptr + 16);
+ pf_ptr += lda;
+ }
+
+ a_ptr_base += 96;
+
+ __asm __volatile(
+ "movi v8.4s,#0x0\n"
+ "ldr w0, [%[x_ptr]]\n"
+ "movi v9.4s,#0x0\n"
+ "ldr q2, [%[a_ptr], #0]\n"
+ "movi v10.4s,#0x0\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "movi v11.4s,#0x0\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "movi v12.4s,#0x0\n"
+ "ldr q5, [%[a_ptr], #0x30]\n"
+ "movi v13.4s,#0x0\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "movi v14.4s,#0x0\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "movi v15.4s,#0x0\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #64]")
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #128]")
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #192]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #256]")
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n" ASM_PREFETCH("[%[pf_ptr], #320]")
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "movi v28.4s, #0x0\n"
+ "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip everything if there are no iterations of the main loop to do.
+ "cbz %w[k], 10f\n"
+
+ // Loop with all prefetches. Exit this loop when firstpf_ptr
+ // hits pf_limit.
+ "1:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #0x4\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n" ASM_PREFETCH("[%[firstpf_ptr]]")
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "sub %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+ "fmla v18.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x00]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "cmp %[firstpf_ptr], %[pf_limit]\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "blt 1b\n"
+
+ // Check that there are still "main" prefetches to do.
+ "cmp %[pf_ptr], %[pf_limit]\n"
+ "bge 9f\n"
+
+ // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
+ "8:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #0x4\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "sub %w[k], %w[k], #1\n" ASM_PREFETCH("[%[x_ptr], #128]")
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+ "fmla v18.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x00]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x30]\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "cmp %[pf_ptr], %[pf_limit]\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "blt 8b\n"
+
+ // Check that there is still work to do.
+ "9:\n"
+ "cmp %w[k], #0\n"
+ "beq 10f\n"
+
+ // Loop without prefetches, exit when k hits 0.
+ "2:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #0x4\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "subs %w[k], %w[k], #1\n"
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n"
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n"
+ "fmla v18.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n"
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n"
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x00]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x30]\n"
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x50]\n"
+ "bne 2b\n"
+
+ "10:\n"
+
+ // Final iteration
+ "dup v0.4s, w0\n"
+ "fmla v8.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x60]\n"
+ "fmla v9.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x70]\n"
+ "fmla v10.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x80]\n"
+ "fmla v11.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x90]\n"
+ "fmla v12.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0xa0]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0xb0]\n"
+ "fmla v14.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0xc0]\n"
+ "fmla v15.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0xd0]\n"
+ "fmla v16.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0xe0]\n"
+ "fmla v17.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0xf0]\n"
+ "fmla v18.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x100]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x110]\n"
+ "fmla v20.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[a_ptr], #0x120]\n"
+ "fmla v21.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[a_ptr], #0x130]\n"
+ "fmla v22.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[a_ptr], #0x140]\n"
+ "fmla v23.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[a_ptr], #0x150]\n"
+ "fmla v24.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[a_ptr], #0x160]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[a_ptr], #0x170]\n"
+ "fmla v26.4s, v2.4s, v0.4s\n"
+ "ldr q2, [%[y_ptr]]\n"
+ "fmla v27.4s, v3.4s, v0.4s\n"
+ "ldr q3, [%[y_ptr], #0x10]\n"
+ "fmla v28.4s, v4.4s, v0.4s\n"
+ "ldr q4, [%[y_ptr], #0x20]\n"
+ "fmla v29.4s, v5.4s, v0.4s\n"
+ "ldr q5, [%[y_ptr], #0x30]\n"
+ "fmla v30.4s, v6.4s, v0.4s\n"
+ "ldr q6, [%[y_ptr], #0x40]\n"
+ "fmla v31.4s, v7.4s, v0.4s\n"
+ "ldr q7, [%[y_ptr], #0x50]\n"
+
+ "fmla v2.4s, v8.4s, %[va].4s\n"
+ "ldr q8, [%[y_ptr], #0x60]\n"
+ "fmla v3.4s, v9.4s, %[va].4s\n"
+ "ldr q9, [%[y_ptr], #0x70]\n"
+ "fmla v4.4s, v10.4s, %[va].4s\n"
+ "ldr q10, [%[y_ptr], #0x80]\n"
+ "fmla v5.4s, v11.4s, %[va].4s\n"
+ "ldr q11, [%[y_ptr], #0x90]\n"
+ "fmla v6.4s, v12.4s, %[va].4s\n"
+ "ldr q12, [%[y_ptr], #0xa0]\n"
+ "str q2, [%[y_ptr], #0x00]\n"
+ "fmla v7.4s, v13.4s, %[va].4s\n"
+ "ldr q13, [%[y_ptr], #0xb0]\n"
+ "str q3, [%[y_ptr], #0x10]\n"
+ "fmla v8.4s, v14.4s, %[va].4s\n"
+ "ldr q14, [%[y_ptr], #0xc0]\n"
+ "str q4, [%[y_ptr], #0x20]\n"
+ "fmla v9.4s, v15.4s, %[va].4s\n"
+ "ldr q15, [%[y_ptr], #0xd0]\n"
+ "str q5, [%[y_ptr], #0x30]\n"
+ "fmla v10.4s, v16.4s, %[va].4s\n"
+ "ldr q16, [%[y_ptr], #0xe0]\n"
+ "str q6, [%[y_ptr], #0x40]\n"
+ "fmla v11.4s, v17.4s, %[va].4s\n"
+ "ldr q17, [%[y_ptr], #0xf0]\n"
+ "str q7, [%[y_ptr], #0x50]\n"
+ "fmla v12.4s, v18.4s, %[va].4s\n"
+ "ldr q18, [%[y_ptr], #0x100]\n"
+ "str q8, [%[y_ptr], #0x60]\n"
+ "fmla v13.4s, v19.4s, %[va].4s\n"
+ "ldr q19, [%[y_ptr], #0x110]\n"
+ "str q9, [%[y_ptr], #0x70]\n"
+ "fmla v14.4s, v20.4s, %[va].4s\n"
+ "ldr q20, [%[y_ptr], #0x120]\n"
+ "str q10, [%[y_ptr], #0x80]\n"
+ "fmla v15.4s, v21.4s, %[va].4s\n"
+ "ldr q21, [%[y_ptr], #0x130]\n"
+ "str q11, [%[y_ptr], #0x90]\n"
+ "fmla v16.4s, v22.4s, %[va].4s\n"
+ "ldr q22, [%[y_ptr], #0x140]\n"
+ "str q12, [%[y_ptr], #0xa0]\n"
+ "fmla v17.4s, v23.4s, %[va].4s\n"
+ "ldr q23, [%[y_ptr], #0x150]\n"
+ "str q13, [%[y_ptr], #0xb0]\n"
+ "fmla v18.4s, v24.4s, %[va].4s\n"
+ "ldr q24, [%[y_ptr], #0x160]\n"
+ "str q14, [%[y_ptr], #0xc0]\n"
+ "fmla v19.4s, v25.4s, %[va].4s\n"
+ "ldr q25, [%[y_ptr], #0x170]\n"
+ "str q15, [%[y_ptr], #0xd0]\n"
+ "fmla v20.4s, v26.4s, %[va].4s\n"
+ "str q16, [%[y_ptr], #0xe0]\n"
+ "fmla v21.4s, v27.4s, %[va].4s\n"
+ "str q17, [%[y_ptr], #0xf0]\n"
+ "fmla v22.4s, v28.4s, %[va].4s\n"
+ "str q18, [%[y_ptr], #0x100]\n"
+ "fmla v23.4s, v29.4s, %[va].4s\n"
+ "str q19, [%[y_ptr], #0x110]\n"
+ "fmla v24.4s, v30.4s, %[va].4s\n"
+ "str q20, [%[y_ptr], #0x120]\n"
+ "fmla v25.4s, v31.4s, %[va].4s\n"
+ "str q21, [%[y_ptr], #0x130]\n"
+
+ "stp q22, q23, [%[y_ptr], #0x140]\n"
+ "stp q24, q25, [%[y_ptr], #0x160]\n"
+ "add %[y_ptr], %[y_ptr], #0x180\n"
+
+ : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k), [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr)
+ : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit)
+ : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+
+ if(N > 0)
+ {
+ // Handle the N tail - up to 95 leftover columns.
+ // This is 0-23 full vectors, plus optionally a 64-bit (2-element) vector
+ // and/or a single value for the remainder.
+
+ // Independent pointers into the matrix for the odd 2 and odd 1.
+ // They double up as flags to indicate whether those paths are needed.
+ const float *odd2_aptr = NULL;
+ const float *odd1_aptr = NULL;
+
+ // Figure out how much work we need to do.
+ int numvecs = N / 4;
+ int rem = N % 4;
+ int k = M;
+
+ // Set up pointers for the odd 2/1 if needed.
+ if(rem >= 2)
+ {
+ odd2_aptr = a_ptr_base + (numvecs * 4);
+ }
+
+ if(rem & 1)
+ {
+ odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr == NULL ? 0 : 2);
+ }
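+
+ // Illustrative decomposition (for example): N == 95 gives numvecs == 23
+ // full 4-float vectors and rem == 3, so odd2_aptr covers two floats and
+ // odd1_aptr the final single float.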
+
+ const float *a_ptr = a_ptr_base;
+ const float *firstpf_ptr = a_ptr_base;
+ const float *pf_ptr = a_ptr_base;
+ const float *pf_limit = a_ptr + (M * lda);
+
+ const float *x_ptr = Xstart;
+ int vecs = 0; // Working variable to count how many vectors to work on.
+ int dopf = 1; // Track whether we are doing prefetches.
+
+ // Figure out how many cache lines we need to prefetch each time.
+ int numpfs = (N + 15) / 16;
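+ // (Assuming 64-byte cache lines, i.e. 16 floats per line.)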
+
+ // Do initial prefetches
+ for(int i = 0; i < firstpfd + 1; i++)
+ {
+ prefetch_1x(firstpf_ptr);
+ firstpf_ptr += lda;
+ }
+
+ // Do "main" prefetches - adapt number to the number we actually need.
+ if(numpfs > 1)
+ {
+ for(int i = 0; i < pfd + 1; i++)
+ {
+ switch(numpfs)
+ {
+ case 2:
+ prefetch_1x(pf_ptr + 16);
+ break;
+
+ case 3:
+ prefetch_2x(pf_ptr + 16);
+ break;
+
+ case 4:
+ prefetch_3x(pf_ptr + 16);
+ break;
+
+ case 5:
+ prefetch_4x(pf_ptr + 16);
+ break;
+
+ case 6:
+ prefetch_5x(pf_ptr + 16);
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ pf_ptr += lda;
+ }
+ }
+ else
+ {
+ // Just disable additional prefetches
+ dopf = 0;
+ }
+
+ // Do the real work
+ __asm __volatile(
+ // Initialize all the vectors - not worth skipping this if only
+ // some are needed.
+ "movi v8.4s,#0x0\n"
+ "ldr w0, [%[x_ptr]]\n"
+ "movi v9.4s,#0x0\n"
+ "movi v10.4s,#0x0\n"
+ "movi v11.4s,#0x0\n"
+ "movi v12.4s,#0x0\n"
+ "movi v13.4s,#0x0\n"
+ "movi v14.4s,#0x0\n"
+ "movi v15.4s,#0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v6.2s, #0x0\n"
+ "movi v5.2s, #0x0\n"
+
+ "1:\n" ASM_PREFETCH("[%[firstpf_ptr]]\n")
+ "11:\n"
+ "dup v0.4s, w0\n"
+ "ldr w0, [%[x_ptr], #4]\n"
+ "add %[x_ptr], %[x_ptr], #4\n"
+
+ "cbz %w[numvecs], 2f\n"
+ "mov %w[vecs], %w[numvecs]\n"
+
+ // Vector 0
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x00]\n"
+ "fmla v8.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 1
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x10]\n"
+ "fmla v9.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 2
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x20]\n"
+ "fmla v10.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 3
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x30]\n"
+ "fmla v11.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 3f\n" ASM_PREFETCH("[%[pf_ptr], #0x40]")
+ "3:\n"
+ "beq 2f\n"
+
+ // Vector 4
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x40]\n"
+ "fmla v12.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 5
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x50]\n"
+ "fmla v13.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 6
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x60]\n"
+ "fmla v14.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 7
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x70]\n"
+ "fmla v15.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 4f\n" ASM_PREFETCH("[%[pf_ptr], #0x80]")
+ "4:\n"
+ "beq 2f\n"
+
+ // Vector 8
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x80]\n"
+ "fmla v16.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 9
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x90]\n"
+ "fmla v17.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 10
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xa0]\n"
+ "fmla v18.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 11
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xb0]\n"
+ "fmla v19.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 5f\n" ASM_PREFETCH("[%[pf_ptr], #0xc0]")
+ "5:\n"
+ "beq 2f\n"
+
+ // Vector 12
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xc0]\n"
+ "fmla v20.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 13
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xd0]\n"
+ "fmla v21.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 14
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xe0]\n"
+ "fmla v22.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 15
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0xf0]\n"
+ "fmla v23.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 6f\n" ASM_PREFETCH("[%[pf_ptr], #0x100]")
+ "6:\n"
+ "beq 2f\n"
+
+ // Vector 16
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x100]\n"
+ "fmla v24.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 17
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x110]\n"
+ "fmla v25.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 18
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x120]\n"
+ "fmla v26.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 19
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x130]\n"
+ "fmla v27.4s, v7.4s, v0.4s\n"
+ // Prefetch
+ "cbz %w[dopf], 7f\n" ASM_PREFETCH("[%[pf_ptr], #0x140]")
+ "7:\n"
+ "beq 2f\n"
+
+ // Vector 20
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x140]\n"
+ "fmla v28.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 21
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x150]\n"
+ "fmla v29.4s, v7.4s, v0.4s\n"
+ "beq 2f\n"
+ // Vector 22
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7,[%[a_ptr], #0x160]\n"
+ "fmla v30.4s, v7.4s, v0.4s\n"
+
+ "2:\n"
+ "add %[a_ptr], %[a_ptr], %[jump]\n"
+
+ // Do the odd 2-vector, if needed
+ "cbz %[odd2_aptr], 8f\n"
+ "ldr d7, [%[odd2_aptr]]\n"
+ "fmla v6.2s, v7.2s, v0.2s\n"
+ "add %[odd2_aptr], %[odd2_aptr], %[jump]\n"
+
+ "8:\n"
+ // Do the odd 1-vector, if needed
+ "cbz %[odd1_aptr], 9f\n"
+ "ldr s7, [%[odd1_aptr]]\n"
+ "fmla v5.2s, v7.2s, v0.2s\n"
+ "add %[odd1_aptr], %[odd1_aptr], %[jump]\n"
+
+ // Get out if needed.
+ "9:\n"
+ "subs %w[k], %w[k], #1\n"
+ "beq 10f\n"
+
+ // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf"
+ "add %[pf_ptr], %[pf_ptr], %[jump]\n"
+ "cmp %[pf_ptr], %[pf_limit]\n"
+ "csel %w[dopf], %w[dopf], WZR, LT\n"
+
+ // Update the "leading" prefetch pointer, don't do the first
+ // instruction of the loop if it's over the limit.
+ "add %[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
+ "cmp %[firstpf_ptr], %[pf_limit]\n"
+ "blt 1b\n"
+ "b 11b\n"
+
+ // Now write out the outputs
+ "10:\n"
+ "cbz %w[numvecs], 12f\n"
+ "mov %w[vecs], %w[numvecs]\n"
+
+ // Vector 0
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v8.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 1
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v9.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 2
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v10.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 3
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v11.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 4
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v12.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 5
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v13.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 6
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v14.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 7
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v15.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 8
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v16.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 9
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v17.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 10
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v18.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 11
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v19.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 12
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v20.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 13
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v21.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 14
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v22.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 15
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v23.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 16
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v24.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 17
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v25.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 18
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v26.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 19
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v27.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 20
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v28.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 21
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v29.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+ "beq 12f\n"
+ // Vector 22
+ "subs %w[vecs], %w[vecs], #1\n"
+ "ldr q7, [%[y_ptr]]\n"
+ "fmla v7.4s, v30.4s, %[va].4s\n"
+ "str q7, [%[y_ptr]], #0x10\n"
+
+ // Odd 2
+ "12:\n"
+ "cbz %[odd2_aptr], 13f\n"
+ "ldr d7, [%[y_ptr]]\n"
+ "fmla v7.2s, v6.2s, %[va].2s\n"
+ "str d7, [%[y_ptr]], #0x8\n"
+
+ // Odd 1
+ "13:\n"
+ "cbz %[odd1_aptr], 14f\n"
+ "ldr s7, [%[y_ptr]]\n"
+ "fmla v7.2s, v5.2s, %[va].2s\n"
+ "str s7, [%[y_ptr]]\n"
+
+ "14:\n"
+ : [a_ptr] "+r"(a_ptr), [x_ptr] "+r"(x_ptr), [y_ptr] "+r"(y_ptr), [k] "+r"(k),
+ [pf_ptr] "+r"(pf_ptr), [firstpf_ptr] "+r"(firstpf_ptr),
+ [odd1_aptr] "+r"(odd1_aptr), [odd2_aptr] "+r"(odd2_aptr),
+ [dopf] "+r"(dopf), [vecs] "+r"(vecs)
+ : [jump] "r"(jump), [va] "w"(va), [pf_limit] "r"(pf_limit), [numvecs] "r"(numvecs)
+ : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13",
+ "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "cc");
+ }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
index 6731480fca..2ab01d680c 100644
--- a/arm_compute/core/NEON/kernels/assembly/mergeresults.hpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,28 +23,42 @@
*/
#pragma once
-template<unsigned int width, unsigned int height, typename Tin, typename Tout>
-void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta) {
+/* Some of the merges need these headers, and they are all included within
+ * the arm_gemm namespace, so pull the headers in here. */
+#include <arm_neon.h>
+
+#include "asmlib.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm
+{
+template <unsigned int width, unsigned int height, typename Tin, typename Tout>
+inline void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout alpha, const Tout beta)
+{
int full_y_blocks = (ymax - y0) / height;
- int y_remainder = (ymax - y0) % height;
- int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
+ int y_remainder = (ymax - y0) % height;
+ int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
int full_x_blocks = (xmax - x0) / width;
- int x_remainder = (xmax - x0) % width;
- int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
+ int x_remainder = (xmax - x0) % width;
+ int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
- for (int y_block = 0; y_block < y_blocks; y_block++) {
+ for(int y_block = 0; y_block < y_blocks; y_block++)
+ {
int ybase = y0 + (y_block * height);
int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
- for (int x_block = 0; x_block < x_blocks; x_block++) {
+ for(int x_block = 0; x_block < x_blocks; x_block++)
+ {
int xbase = x0 + (x_block * width);
int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
- for (int row=0; row < fill_rows; row++) {
- for (int col=0; col < fill_cols; col++) {
+ for(int row = 0; row < fill_rows; row++)
+ {
+ for(int col = 0; col < fill_cols; col++)
+ {
Tout &p = out[(ybase + row) * ldc + xbase + col];
p = (p * alpha) + (beta * in[row * width + col]);
@@ -57,3 +71,5 @@ void MergeResults(Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, i
}
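+
+/* Illustrative usage (a sketch with hypothetical buffer names, not part of
+ * this patch): merging a 12x8 block of accumulated results held in
+ * result_buffer into the output matrix C with leading dimension ldc:
+ *
+ *   MergeResults<12, 8>(C, result_buffer, ldc,
+ *                       y0, ymax, x0, xmax, alpha, beta);
+ */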
#include "merges/list.hpp"
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
new file mode 100644
index 0000000000..b44e56499f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+template <>
+inline void MergeResults<8, 6>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for(int y = y0; y < ymax; y += 6)
+ {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+
+ for(int i = x0; i < xmax; i += 8)
+ {
+ float dummyres[8];
+
+ /* Make sure we throw away results if Y isn't a multiple of 6.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 5) >= ymax)
+ {
+ switch((y + 5) - ymax)
+ {
+ case 4:
+ outptr1 = dummyres;
+ case 3:
+ outptr2 = dummyres;
+ case 2:
+ outptr3 = dummyres;
+ case 1:
+ outptr4 = dummyres;
+ case 0:
+ outptr5 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 7) >= xmax)
+ {
+ for(int xi = 0; xi < 8; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 8]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 16]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 24]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 32]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 40]) + (*outptr5 * beta);
+ outptr5++;
+ }
+ }
+ inptr += 48;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ // Rows 0-1
+ "VLD1.32 {d8-d11}, [%[outptr0]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr1]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[inptr], #352]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr0]]!\n" ASM_PREFETCH("[%[inptr], #416]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[inptr], #480]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr1]]!\n"
+
+ // Rows 2-3
+ "VLD1.32 {d8-d11}, [%[outptr2]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr3]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr0], #96]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr2]]!\n" ASM_PREFETCH("[%[outptr1], #96]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr2], #96]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr3]]!\n"
+
+ // Rows 4-5
+ "VLD1.32 {d8-d11}, [%[outptr4]]\n"
+ "VMUL.f32 q4, q4, %q[bv]\n"
+ "VLD1.32 {d12-d15}, [%[outptr5]]\n"
+ "VMUL.f32 q5, q5, %q[bv]\n"
+ "VLD1.32 {d0-d3}, [%[inptr]]!\n"
+ "VMUL.f32 q6, q6, %q[bv]\n"
+ "VLD1.32 {d4-d7}, [%[inptr]]!\n"
+ "VMUL.f32 q7, q7, %q[bv]\n"
+
+ "VMLA.f32 q4, q0, %q[av]\n" ASM_PREFETCH("[%[outptr3], #96]")
+ "VMLA.f32 q5, q1, %q[av]\n"
+ "VST1.32 {d8-d11}, [%[outptr4]]!\n" ASM_PREFETCH("[%[outptr4], #96]") "VMLA.f32 q6, q2, %q[av]\n" ASM_PREFETCH("[%[outptr5], #128]")
+ "VMLA.f32 q7, q3, %q[av]\n"
+ "VST1.32 {d12-d15}, [%[outptr5]]!\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [inptr] "+r"(inptr)
+ : [av] "w"(av), [bv] "w"(bv)
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+ }
+ }
+ }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
new file mode 100644
index 0000000000..3b59a43c52
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_12x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template <>
+inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ float *outptr0 = out + (y * ldout) + x0;
+ float *outptr1 = outptr0 + ldout;
+ float *outptr2 = outptr1 + ldout;
+ float *outptr3 = outptr2 + ldout;
+ float *outptr4 = outptr3 + ldout;
+ float *outptr5 = outptr4 + ldout;
+ float *outptr6 = outptr5 + ldout;
+ float *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 12)
+ {
+ float dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
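+ // For example, with only three valid rows remaining (ymax - y == 3), the
+ // switch below points outptr3 through outptr7 at dummyres, so only rows
+ // 0-2 are written to real output.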
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 11) >= xmax)
+ {
+ for(int xi = 0; xi < 12; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ // Rows 0-1
+ "LDP q16, q17, [%[outptr0]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr0], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr1]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr1], #32]\n" ASM_PREFETCH("[%[inptr], #768]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #832]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr0]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr0]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr1]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr1]], #16\n"
+
+ // Rows 2-3
+ "LDP q16, q17, [%[outptr2]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr2], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr3]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr3], #32]\n" ASM_PREFETCH("[%[inptr], #960]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1024]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr2]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr2]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[inptr], #1088]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr3]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr3]], #16\n"
+
+ // Rows 4-5
+ ASM_PREFETCH("[%[outptr0], #80]")
+ "LDP q16, q17, [%[outptr4]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr4], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr5]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr5], #32]\n" ASM_PREFETCH("[%[outptr1], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr2], #80]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr4]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr4]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr3], #80]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr5]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr5]], #16\n"
+
+ // Rows 6-7
+ ASM_PREFETCH("[%[outptr4], #80]")
+ "LDP q16, q17, [%[outptr6]]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDR q18, [%[outptr6], #32]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDP q19, q20, [%[outptr7]]\n"
+ "FMUL v18.4s, v18.4s, %[bv].4s\n"
+ "LDR q21, [%[outptr7], #32]\n" ASM_PREFETCH("[%[outptr5], #80]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr6], #128]")
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "STP q16, q17, [%[outptr6]], #32\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q18, [%[outptr6]], #16\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n" ASM_PREFETCH("[%[outptr7], #128]")
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "STP q19, q20, [%[outptr7]], #32\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "STR q21, [%[outptr7]], #16\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [av] "w"(av), [bv] "w"(bv)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+ }
+ }
+ }
+}
+
+#endif // __aarch64__
\ No newline at end of file
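For reference, the block update performed by the 12x8 float merge above (both the ragged-edge path and the assembly path) computes out = alpha * in + beta * out over a 12-column by 8-row tile whose source is stored contiguously with a row stride of 12. A minimal scalar sketch follows; the function name merge_block_reference is illustrative and not part of the library.

    // Scalar equivalent of MergeResults<12, 8> for float. The input is consumed
    // as consecutive 12x8 tiles (96 floats per tile), and each output element is
    // updated as out = alpha * in + beta * out. Ragged edges skip out-of-range
    // elements while still advancing past the whole tile.
    inline void merge_block_reference(float *out, const float *in, int ldout,
                                      int y0, int ymax, int x0, int xmax,
                                      float alpha, float beta)
    {
        for(int y = y0; y < ymax; y += 8)
        {
            for(int x = x0; x < xmax; x += 12)
            {
                for(int r = 0; r < 8 && (y + r) < ymax; r++)
                {
                    for(int c = 0; c < 12 && (x + c) < xmax; c++)
                    {
                        float *o = out + (y + r) * ldout + (x + c);
                        *o = (alpha * in[r * 12 + c]) + (*o * beta);
                    }
                }
                in += 96; // one full 12x8 tile consumed per (y, x) block
            }
        }
    }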
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
new file mode 100644
index 0000000000..12a090112d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_float_to_half_12x8.hpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+template <>
+inline void MergeResults<12, 8>(__fp16 *out, const float *in, int ldout, int y0, int ymax, int x0, int xmax, const __fp16 alpha, const __fp16 beta)
+{
+ const float *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 24);
+
+ float32x4_t av = vdupq_n_f32(alpha);
+ float32x4_t bv = vdupq_n_f32(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ __fp16 *outptr0 = out + (y * ldout) + x0;
+ __fp16 *outptr1 = outptr0 + ldout;
+ __fp16 *outptr2 = outptr1 + ldout;
+ __fp16 *outptr3 = outptr2 + ldout;
+ __fp16 *outptr4 = outptr3 + ldout;
+ __fp16 *outptr5 = outptr4 + ldout;
+ __fp16 *outptr6 = outptr5 + ldout;
+ __fp16 *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 12)
+ {
+ __fp16 dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 11) >= xmax)
+ {
+ for(int xi = 0; xi < 12; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ // Rows 0-1
+ "LDR q16, [%[outptr0]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr0], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr1]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr1], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #768]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #832]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #896]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[inptr], #960]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr0]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr0]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr1]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr1]], #8\n"
+
+ // Rows 2-3
+ "LDR q16, [%[outptr2]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr2], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr3]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr3], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[inptr], #1024]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[inptr], #1088]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr0], #64]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr1], #64]")
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr2]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr2]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr3]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr3]], #8\n"
+
+ // Rows 4-5
+ "LDR q16, [%[outptr4]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr4], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr5]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr5], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr2], #64]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr3], #64]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr4], #88]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr4]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr4]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr5]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr5]], #8\n"
+
+ // Rows 6-7
+ "LDR q16, [%[outptr6]]\n"
+ "FCVTL2 v17.4s, v16.8h\n"
+ "LDR d18, [%[outptr6], #16]\n"
+ "FCVTL v16.4s, v16.4h\n"
+ "LDR q19, [%[outptr7]]\n"
+ "FMUL v17.4s, v17.4s, %[bv].4s\n"
+ "LDR d21, [%[outptr7], #16]\n"
+ "FMUL v16.4s, v16.4s, %[bv].4s\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FCVTL v18.4s, v18.4h\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FCVTL2 v20.4s, v19.8h\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FCVTL v19.4s, v19.4h\n" ASM_PREFETCH("[%[outptr5], #64]") "FCVTL v21.4s, v21.4h\n" ASM_PREFETCH("[%[outptr6], #88]") "FMUL v18.4s, v18.4s, %[bv].4s\n" ASM_PREFETCH("[%[outptr7], #88]")
+ "FMUL v20.4s, v20.4s, %[bv].4s\n"
+ "FMUL v19.4s, v19.4s, %[bv].4s\n"
+ "FMUL v21.4s, v21.4s, %[bv].4s\n"
+ "FMLA v16.4s, v0.4s, %[av].4s\n"
+ "FMLA v17.4s, v1.4s, %[av].4s\n"
+ "FCVTN v16.4h, v16.4s\n"
+ "FCVTN2 v16.8h, v17.4s\n"
+ "FMLA v18.4s, v2.4s, %[av].4s\n"
+ "STR q16, [%[outptr6]], #16\n"
+ "FCVTN v18.4h, v18.4s\n"
+ "STR d18, [%[outptr6]], #8\n"
+ "FMLA v19.4s, v3.4s, %[av].4s\n"
+ "FMLA v20.4s, v4.4s, %[av].4s\n"
+ "FCVTN v19.4h, v19.4s\n"
+ "FCVTN2 v19.8h, v20.4s\n"
+ "STR q19, [%[outptr7]], #16\n"
+ "FMLA v21.4s, v5.4s, %[av].4s\n"
+ "FCVTN v21.4h, v21.4s\n"
+ "STR d21, [%[outptr7]], #8\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [av] "w"(av), [bv] "w"(bv)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+ }
+ }
+ }
+}
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
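The kernel above keeps the accumulation in fp32: the existing __fp16 outputs are widened with FCVTL/FCVTL2, scaled by beta, combined with alpha times the fp32 input via FMLA, and only then narrowed back with FCVTN/FCVTN2. A rough intrinsic-level sketch of one 8-element slice is shown below, assuming the same __aarch64__ and FP16 feature guards as the file itself; merge8_f32_to_f16 is an illustrative name, not library API.

    #include <arm_neon.h>

    // One 8-element slice of the merge above, written with intrinsics: widen the
    // stored fp16 values, apply beta, fuse in alpha * input, narrow back to fp16.
    static inline void merge8_f32_to_f16(__fp16 *out, const float *in,
                                         float32x4_t av, float32x4_t bv)
    {
        float16x8_t old = vld1q_f16(reinterpret_cast<const float16_t *>(out));
        float32x4_t lo  = vmulq_f32(vcvt_f32_f16(vget_low_f16(old)), bv);   // FCVTL  + FMUL
        float32x4_t hi  = vmulq_f32(vcvt_f32_f16(vget_high_f16(old)), bv);  // FCVTL2 + FMUL
        lo              = vfmaq_f32(lo, vld1q_f32(in), av);                 // FMLA
        hi              = vfmaq_f32(hi, vld1q_f32(in + 4), av);             // FMLA
        float16x8_t res = vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi)); // FCVTN + FCVTN2
        vst1q_f16(reinterpret_cast<float16_t *>(out), res);
    }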
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
new file mode 100644
index 0000000000..08cfc00523
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_half_24x8.hpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+template <>
+inline void MergeResults<24, 8>(__fp16 *out, const __fp16 *in, const int ldout, const int y0, const int ymax,
+ const int x0, const int xmax, const __fp16 alpha, const __fp16 beta)
+{
+ const __fp16 *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 48);
+
+ float16x8_t va = vdupq_n_f16(alpha);
+ float16x8_t vb = vdupq_n_f16(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ __fp16 *outptr0 = out + (y * ldout) + x0;
+ __fp16 *outptr1 = outptr0 + ldout;
+ __fp16 *outptr2 = outptr1 + ldout;
+ __fp16 *outptr3 = outptr2 + ldout;
+ __fp16 *outptr4 = outptr3 + ldout;
+ __fp16 *outptr5 = outptr4 + ldout;
+ __fp16 *outptr6 = outptr5 + ldout;
+ __fp16 *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 24)
+ {
+ __fp16 dummyres[24];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 23) >= xmax)
+ {
+ for(int xi = 0; xi < 24; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 24]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 48]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 72]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 96]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 120]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 144]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 168]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 192;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ ".arch armv8.2-a+fp16\n"
+ // Rows 0-1
+ "LDP q16, q17, [%[outptr0]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr0], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr1]]\n"
+ "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #768]")
+ "LDR q21, [%[outptr1], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr]]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #32]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #64]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #832]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr0]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr0]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #896]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr1]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr1]], #16\n" ASM_PREFETCH("[%[inptr], #960]")
+
+ // Rows 2-3
+ "LDP q16, q17, [%[outptr2]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr2], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr3]]\n"
+ "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[inptr], #1024]")
+ "LDR q21, [%[outptr3], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr], #96]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #128]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #160]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[inptr], #1088]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr2]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr2]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr0], #80]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr3]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr3]], #16\n" ASM_PREFETCH("[%[outptr1], #80]")
+
+ // Rows 4-5
+ "LDP q16, q17, [%[outptr4]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr4], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr5]]\n"
+ "FMUL v18.8h, v18.8h, %[vb].8h\n" ASM_PREFETCH("[%[outptr2], #80]")
+ "LDR q21, [%[outptr5], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr], #192]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #224]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #256]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr3], #80]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr4]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr4]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr4], #80]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr5]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr5]], #16\n"
+
+ // Rows 6-7
+ "LDP q16, q17, [%[outptr6]]\n"
+ "FMUL v16.8h, v16.8h, %[vb].8h\n"
+ "LDR q18, [%[outptr6], #32]\n"
+ "FMUL v17.8h, v17.8h, %[vb].8h\n"
+ "LDP q19, q20, [%[outptr7]]\n" ASM_PREFETCH("[%[outptr5], #80]")
+ "FMUL v18.8h, v18.8h, %[vb].8h\n"
+ "LDR q21, [%[outptr7], #32]\n"
+ "FMUL v19.8h, v19.8h, %[vb].8h\n"
+ "LDP q0, q1, [%[inptr], #288]\n"
+ "FMUL v20.8h, v20.8h, %[vb].8h\n"
+ "LDP q2, q3, [%[inptr], #320]\n"
+ "FMUL v21.8h, v21.8h, %[vb].8h\n"
+ "LDP q4, q5, [%[inptr], #352]\n"
+ "FMLA v16.8h, v0.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr6], #128]")
+ "FMLA v17.8h, v1.8h, %[va].8h\n"
+ "STP q16, q17, [%[outptr6]], #32\n"
+ "FMLA v18.8h, v2.8h, %[va].8h\n"
+ "STR q18, [%[outptr6]], #16\n"
+ "FMLA v19.8h, v3.8h, %[va].8h\n" ASM_PREFETCH("[%[outptr7], #128]")
+ "FMLA v20.8h, v4.8h, %[va].8h\n"
+ "STP q19, q20, [%[outptr7]], #32\n"
+ "FMLA v21.8h, v5.8h, %[va].8h\n"
+ "STR q21, [%[outptr7]], #16\n"
+ "ADD %[inptr], %[inptr], #384\n"
+ : [outptr0] "+r"(outptr0), [outptr1] "+r"(outptr1), [outptr2] "+r"(outptr2), [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4), [outptr5] "+r"(outptr5), [outptr6] "+r"(outptr6), [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [va] "w"(va), [vb] "w"(vb)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21");
+ }
+ }
+ }
+}
+
+#endif // __aarch64__ && __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
new file mode 100644
index 0000000000..dc247aad37
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+template <>
+inline void MergeResults<12, 8>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t alpha, const int32_t beta)
+{
+ const int32_t *inptr = in;
+ prefetch_6x(inptr);
+ prefetch_6x(inptr + 96);
+
+ int32x4_t alpha_value = vdupq_n_s32(alpha);
+ int32x4_t beta_value = vdupq_n_s32(beta);
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ int32_t *outptr0 = out + (y * ldout) + x0;
+ int32_t *outptr1 = outptr0 + ldout;
+ int32_t *outptr2 = outptr1 + ldout;
+ int32_t *outptr3 = outptr2 + ldout;
+ int32_t *outptr4 = outptr3 + ldout;
+ int32_t *outptr5 = outptr4 + ldout;
+ int32_t *outptr6 = outptr5 + ldout;
+ int32_t *outptr7 = outptr6 + ldout;
+
+ prefetch_2x(outptr0);
+ prefetch_2x(outptr1);
+ prefetch_2x(outptr2);
+ prefetch_2x(outptr3);
+ prefetch_2x(outptr4);
+ prefetch_2x(outptr5);
+ prefetch_2x(outptr6);
+ prefetch_2x(outptr7);
+
+ for(int i = x0; i < xmax; i += 12)
+ {
+ int32_t dummyres[12];
+
+ /* Make sure we throw away results if Y isn't a multiple of 8.
+ * We do this by pointing the result pointer at a dummy buffer
+ * we later discard. */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ case 6:
+ outptr1 = dummyres;
+ case 5:
+ outptr2 = dummyres;
+ case 4:
+ outptr3 = dummyres;
+ case 3:
+ outptr4 = dummyres;
+ case 2:
+ outptr5 = dummyres;
+ case 1:
+ outptr6 = dummyres;
+ case 0:
+ outptr7 = dummyres;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ /* For ragged X, manually copy over the valid results. */
+ if((i + 11) >= xmax)
+ {
+ for(int xi = 0; xi < 12; xi++)
+ {
+ if((i + xi) < xmax)
+ {
+ *outptr0 = (alpha * inptr[xi]) + (*outptr0 * beta);
+ outptr0++;
+ *outptr1 = (alpha * inptr[xi + 12]) + (*outptr1 * beta);
+ outptr1++;
+ *outptr2 = (alpha * inptr[xi + 24]) + (*outptr2 * beta);
+ outptr2++;
+ *outptr3 = (alpha * inptr[xi + 36]) + (*outptr3 * beta);
+ outptr3++;
+ *outptr4 = (alpha * inptr[xi + 48]) + (*outptr4 * beta);
+ outptr4++;
+ *outptr5 = (alpha * inptr[xi + 60]) + (*outptr5 * beta);
+ outptr5++;
+ *outptr6 = (alpha * inptr[xi + 72]) + (*outptr6 * beta);
+ outptr6++;
+ *outptr7 = (alpha * inptr[xi + 84]) + (*outptr7 * beta);
+ outptr7++;
+ }
+ }
+ inptr += 96;
+ }
+ else
+ {
+ /* Optimized routine to copy an entire block */
+ __asm __volatile(
+ // Row 0
+ ASM_PREFETCH("[%x[outptr1], #192]")
+ "ldr q3, [%x[outptr0]]\n"
+ "ldr q4, [%x[outptr0], #0x10]\n"
+ "ldr q5, [%x[outptr0], #0x20]\n"
+ "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr]]\n"
+ "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0x10]\n"
+ "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0x20]\n"
+ "mla v3.4s, v6.4s, %[beta_value].4s\n"
+ "ldr q0, [%x[outptr1]]\n"
+ "mla v4.4s, v7.4s, %[beta_value].4s\n"
+ "ldr q1, [%x[outptr1], #0x10]\n"
+ "mla v5.4s, v8.4s, %[beta_value].4s\n"
+ "ldr q2, [%x[outptr1], #0x20]\n"
+
+ // Row 1
+ ASM_PREFETCH("[%x[outptr2], #192]")
+ "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr], #0x30]\n"
+ "str q3, [%x[outptr0]], #0x10\n"
+ "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0x40]\n"
+ "str q4, [%x[outptr0]], #0x10\n"
+ "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0x50]\n"
+ "str q5, [%x[outptr0]], #0x10\n"
+ "mla v0.4s, v6.4s, %[beta_value].4s\n"
+ "ldr q3, [%x[outptr2]]\n"
+ "mla v1.4s, v7.4s, %[beta_value].4s\n"
+ "ldr q4, [%x[outptr2], #0x10]\n"
+ "mla v2.4s, v8.4s, %[beta_value].4s\n"
+ "ldr q5, [%x[outptr2], #0x20]\n"
+
+ // Row 2
+ ASM_PREFETCH("[%x[outptr3], #192]")
+ "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr], #0x60]\n"
+ "str q0, [%x[outptr1]], #0x10\n"
+ "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0x70]\n"
+ "str q1, [%x[outptr1]], #0x10\n"
+ "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0x80]\n"
+ "str q2, [%x[outptr1]], #0x10\n"
+ "mla v3.4s, v6.4s, %[beta_value].4s\n"
+ "ldr q0, [%x[outptr3]]\n"
+ "mla v4.4s, v7.4s, %[beta_value].4s\n"
+ "ldr q1, [%x[outptr3], #0x10]\n"
+ "mla v5.4s, v8.4s, %[beta_value].4s\n"
+ "ldr q2, [%x[outptr3], #0x20]\n"
+
+ // Row 3
+ ASM_PREFETCH("[%x[outptr4], #192]")
+ "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr], #0x90]\n"
+ "str q3, [%x[outptr2]], #0x10\n"
+ "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0xa0]\n"
+ "str q4, [%x[outptr2]], #0x10\n"
+ "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0xb0]\n"
+ "str q5, [%x[outptr2]], #0x10\n"
+ "mla v0.4s, v6.4s, %[beta_value].4s\n"
+ "ldr q3, [%x[outptr4]]\n"
+ "mla v1.4s, v7.4s, %[beta_value].4s\n"
+ "ldr q4, [%x[outptr4], #0x10]\n"
+ "mla v2.4s, v8.4s, %[beta_value].4s\n"
+ "ldr q5, [%x[outptr4], #0x20]\n"
+
+ // Row 4
+ ASM_PREFETCH("[%x[outptr5], #192]")
+ "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr], #0xc0]\n"
+ "str q0, [%x[outptr3]], #0x10\n"
+ "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0xd0]\n"
+ "str q1, [%x[outptr3]], #0x10\n"
+ "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0xe0]\n"
+ "str q2, [%x[outptr3]], #0x10\n"
+ "mla v3.4s, v6.4s, %[beta_value].4s\n"
+ "ldr q0, [%x[outptr5]]\n"
+ "mla v4.4s, v7.4s, %[beta_value].4s\n"
+ "ldr q1, [%x[outptr5], #0x10]\n"
+ "mla v5.4s, v8.4s, %[beta_value].4s\n"
+ "ldr q2, [%x[outptr5], #0x20]\n"
+
+ // Row 5
+ ASM_PREFETCH("[%x[outptr6], #192]")
+ "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr], #0xf0]\n"
+ "str q3, [%x[outptr4]], #0x10\n"
+ "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0x100]\n"
+ "str q4, [%x[outptr4]], #0x10\n"
+ "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0x110]\n"
+ "str q5, [%x[outptr4]], #0x10\n"
+ "mla v0.4s, v6.4s, %[beta_value].4s\n"
+ "ldr q3, [%x[outptr6]]\n"
+ "mla v1.4s, v7.4s, %[beta_value].4s\n"
+ "ldr q4, [%x[outptr6], #0x10]\n"
+ "mla v2.4s, v8.4s, %[beta_value].4s\n"
+ "ldr q5, [%x[outptr6], #0x20]\n"
+
+ // Row 6
+ ASM_PREFETCH("[%x[outptr7], #192]")
+ "mul v3.4s, v3.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr], #0x120]\n"
+ "str q0, [%x[outptr5]], #0x10\n"
+ "mul v4.4s, v4.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0x130]\n"
+ "str q1, [%x[outptr5]], #0x10\n"
+ "mul v5.4s, v5.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0x140]\n"
+ "str q2, [%x[outptr5]], #0x10\n"
+ "mla v3.4s, v6.4s, %[beta_value].4s\n"
+ "ldr q0, [%x[outptr7]]\n"
+ "mla v4.4s, v7.4s, %[beta_value].4s\n"
+ "ldr q1, [%x[outptr7], #0x10]\n"
+ "mla v5.4s, v8.4s, %[beta_value].4s\n"
+ "ldr q2, [%x[outptr7], #0x20]\n"
+
+ // Row 7
+ "mul v0.4s, v0.4s, %[alpha_value].4s\n"
+ "ldr q6, [%x[inptr], #0x150]\n"
+ "str q3, [%x[outptr6]], #0x10\n"
+ "mul v1.4s, v1.4s, %[alpha_value].4s\n"
+ "ldr q7, [%x[inptr], #0x160]\n"
+ "str q4, [%x[outptr6]], #0x10\n"
+ "mul v2.4s, v2.4s, %[alpha_value].4s\n"
+ "ldr q8, [%x[inptr], #0x170]\n"
+ "str q5, [%x[outptr6]], #0x10\n"
+ "mla v0.4s, v6.4s, %[beta_value].4s\n"
+ "mla v1.4s, v7.4s, %[beta_value].4s\n"
+ "mla v2.4s, v8.4s, %[beta_value].4s\n"
+ "str q0, [%x[outptr7]], #0x10\n"
+ "str q1, [%x[outptr7]], #0x10\n"
+ "str q2, [%x[outptr7]], #0x10\n"
+
+ "add %x[inptr], %x[inptr], #0x180\n"
+ : [outptr0] "+r"(outptr0),
+ [outptr1] "+r"(outptr1),
+ [outptr2] "+r"(outptr2),
+ [outptr3] "+r"(outptr3),
+ [outptr4] "+r"(outptr4),
+ [outptr5] "+r"(outptr5),
+ [outptr6] "+r"(outptr6),
+ [outptr7] "+r"(outptr7),
+ [inptr] "+r"(inptr)
+ : [alpha_value] "w"(alpha_value),
+ [beta_value] "w"(beta_value)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+ }
+ }
+ }
+}
+
+template <>
+inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta)
+{
+ // Since the above code uses only MUL and MLA instructions, we can discard the "unsignedness" and safely reuse the signed implementation.
+ MergeResults<12, 8>(reinterpret_cast<int32_t *>(out), reinterpret_cast<const int32_t *>(in), ldout, y0, ymax, x0, xmax, static_cast<const int32_t>(alpha), static_cast<const int32_t>(beta));
+}
+
+#endif // __aarch64__
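The uint32_t overload above works because the underlying MUL and MLA instructions operate modulo 2^32, so the resulting bit pattern of alpha * in + beta * out is independent of whether the operands are interpreted as signed or unsigned. A small self-contained check (with values chosen so the signed arithmetic also stays in range at the C++ level) illustrates this:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // At the instruction level MUL/MLA wrap modulo 2^32 regardless of
        // signedness, so reinterpreting uint32_t data as int32_t (as the
        // overload above does) produces the same stored bits.
        uint32_t in = 0xFFFFFFF0u, out = 7u, alpha = 3u, beta = 5u;

        uint32_t unsigned_result = (alpha * in) + (beta * out);

        int32_t signed_result = (static_cast<int32_t>(alpha) * static_cast<int32_t>(in)) +
                                (static_cast<int32_t>(beta) * static_cast<int32_t>(out));

        assert(static_cast<uint32_t>(signed_result) == unsigned_result);
        return 0;
    }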
diff --git a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
index 29b915a75d..7d56e58f44 100644
--- a/arm_compute/core/NEON/kernels/assembly/merges/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
#include "a32_merge_float_8x6.hpp"
#include "a64_merge_float_12x8.hpp"
-//#include "a64_merge_float_to_half_12x8.hpp"
-//#include "a64_merge_half_24x8.hpp"
-//#include "a64_merge_int32_12x8.hpp"
+#include "a64_merge_float_to_half_12x8.hpp"
+#include "a64_merge_half_24x8.hpp"
+#include "a64_merge_int32_12x8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
new file mode 100644
index 0000000000..b29cc58d5d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/misc.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <newgemm_lib.hpp>
+
+unsigned int get_cpu_impl()
+{
+#ifndef BARE_METAL
+ int fd = open("/proc/cpuinfo", 0);
+ char buff[3000];
+ char *pos;
+ char *end;
+ int foundid = 0;
+ int variant = 0;
+
+ int cpu = sched_getcpu();
+
+ if(fd < 0)
+ {
+ return 0;
+ }
+
+ int charsread = read(fd, buff, 3000);
+ pos = buff;
+ end = buff + charsread;
+
+ close(fd);
+
+ /* So, to date I've encountered two formats for /proc/cpuinfo.
+ *
+ * One of them just lists processor : n for each processor (with no
+ * other info), then at the end lists part information for the current
+ * CPU.
+ *
+ * The other has an entire clause (including part number info) for each
+ * CPU in the system, with "processor : n" headers.
+ *
+ * We can cope with either of these formats by waiting to see
+ * "processor: n" (where n = our CPU ID), and then looking for the next
+ * "CPU part" field.
+ */
+ while(pos < end)
+ {
+ if(foundid && !strncmp(pos, "CPU variant", 11))
+ {
+ pos += 13;
+ char *resume = end; // Need to continue scanning after this
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ resume = ch + 1;
+ break;
+ }
+ }
+
+ variant = strtoul(pos, NULL, 0);
+
+ pos = resume;
+ }
+
+ if(foundid && !strncmp(pos, "CPU part", 8))
+ {
+ /* Found part number */
+ pos += 11;
+ unsigned int num;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ num = strtoul(pos, NULL, 0);
+
+ return (num << 4) | (variant << 20);
+ }
+
+ if(!strncmp(pos, "processor", 9))
+ {
+ /* Found processor ID, see if it's ours. */
+ pos += 11;
+ int num;
+
+ for(char *ch = pos; ch < end; ch++)
+ {
+ if(*ch == '\n')
+ {
+ *ch = '\0';
+ break;
+ }
+ }
+
+ num = strtol(pos, NULL, 0);
+
+ if(num == cpu)
+ {
+ foundid = 1;
+ }
+ }
+
+ while(pos < end)
+ {
+ char ch = *pos++;
+ if(ch == '\n' || ch == '\0')
+ {
+ break;
+ }
+ }
+ }
+#endif
+
+ return 0;
+}
+
+CPUInfo *get_CPUInfo()
+{
+ static CPUInfo ci;
+
+ return &ci;
+}
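get_cpu_impl() above returns the part number and variant packed as (part << 4) | (variant << 20), i.e. in the same bit positions they occupy in MIDR_EL1 (part in bits [15:4], variant in bits [23:20]). A short sketch of encoding and decoding that value; the helper name is illustrative (0xd05 is the published Cortex-A55 part id):

    #include <cstdio>

    // Packs "CPU part" and "CPU variant" the same way get_cpu_impl() does:
    // part lands in bits [15:4] and variant in bits [23:20], mirroring MIDR_EL1.
    static unsigned int pack_cpu_impl(unsigned int part, unsigned int variant)
    {
        return (part << 4) | (variant << 20);
    }

    int main()
    {
        unsigned int impl = pack_cpu_impl(0xd05, 0x1); // e.g. a Cortex-A55, variant r1

        unsigned int part    = (impl >> 4) & 0xfff;
        unsigned int variant = (impl >> 20) & 0xf;

        std::printf("impl=0x%06x part=0x%03x variant=0x%x\n", impl, part, variant);
        return 0;
    }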
diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/src/core/NEON/kernels/arm_gemm/profiler.hpp
index f7a1d1c70c..c38b0a443c 100644
--- a/arm_compute/core/NEON/kernels/assembly/profiler.hpp
+++ b/src/core/NEON/kernels/arm_gemm/profiler.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,57 +27,83 @@
#include "../perf.h"
-class profiler {
+#ifndef NO_MULTI_THREADING
+#include <mutex>
+#endif
+
+namespace arm_gemm
+{
+#ifndef NO_MULTI_THREADING
+extern std::mutex report_mutex;
+#endif
+
+class profiler
+{
private:
- static const int maxevents = 10000;
- unsigned long times[maxevents];
- unsigned long units[maxevents];
- int events[maxevents];
- int currentevent;
- int countfd;
+ static const int maxevents = 100000;
+ unsigned long times[maxevents] = {};
+ unsigned long units[maxevents] = {};
+ int events[maxevents] = {};
+ int currentevent = 0;
+ int countfd = 0;
public:
- profiler() {
- currentevent=0;
- countfd=open_cycle_counter();
+ profiler()
+ {
+ countfd = open_cycle_counter();
}
- ~profiler() {
+ ~profiler()
+ {
close(countfd);
- int tots[5];
+ int tots[5];
unsigned long counts[5];
unsigned long tunits[5];
- const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
+ const char *descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
- for (int i=1; i<5; i++) {
- tots[i] = 0;
+ for(int i = 1; i < 5; i++)
+ {
+ tots[i] = 0;
counts[i] = 0;
tunits[i] = 0;
}
- printf("Profiled events:\n");
- for (int i=0; i<currentevent; i++) {
+ for(int i = 0; i < currentevent; i++)
+ {
+ // printf("%10s: %ld\n", descs[events[i]-1], times[i]);
tots[events[i]]++;
counts[events[i]] += times[i];
tunits[events[i]] += units[i];
}
+#ifdef NO_MULTI_THREADING
+ printf("Profiled events:\n");
+#else
+ std::lock_guard<std::mutex> lock(report_mutex);
+ printf("Profiled events (cpu %d):\n", sched_getcpu());
+#endif
+
printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
- for (int i=1; i<5; i++) {
- printf("%20s: %9d %9ld %9ld %12lu %9.2f\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i],tunits[i],(float)tunits[i]/counts[i]);
+ for(int i = 1; i < 5; i++)
+ {
+ printf("%20s: %9d %9ld %9ld %12lu %9.2f\n", descs[i - 1], tots[i], counts[i], counts[i] / tots[i], tunits[i], (float)tunits[i] / counts[i]);
}
}
template <typename T>
- void operator() (int i, unsigned long u, T func) {
- if (currentevent==maxevents) {
+ void operator()(int i, unsigned long u, T func)
+ {
+ if(currentevent == maxevents)
+ {
func();
- } else {
+ }
+ else
+ {
events[currentevent] = i;
- units[currentevent] = u;
+ units[currentevent] = u;
start_counter(countfd);
func();
- long long cycs = stop_counter(countfd);
+ long long cycs = stop_counter(countfd);
times[currentevent++] = cycs;
}
}
@@ -85,19 +111,23 @@ public:
#else
-class profiler {
+namespace arm_gemm
+{
+class profiler
+{
public:
template <typename T>
- void operator() (int i, unsigned long u, T func) {
+ void operator()(int i, unsigned long u, T func)
+ {
func();
}
};
-#endif
+#endif // CYCLE_PROFILING
+
+} // namespace arm_gemm
#define PROFILE_PREPA 1
#define PROFILE_PREPB 2
#define PROFILE_KERNEL 3
#define PROFILE_MERGE 4
-
-
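For CYCLE_PROFILING builds, the profiler above is used by handing it an event id, a work estimate (bytes or MACs) and a callable; the callable is timed with the cycle counter and the sample is attributed to that event class. A minimal usage sketch, assuming the arm_gemm headers are on the include path (the buffer copy only stands in for a real merge step):

    #include <cstring>

    // Times the functor with the cycle counter and books the sample under the
    // "Merge" event class; the memcpy is a placeholder for real work.
    void profiled_merge_example(arm_gemm::profiler &prof,
                                float *dst, const float *src, unsigned long n)
    {
        prof(PROFILE_MERGE, n * sizeof(float), [&]() {
            std::memcpy(dst, src, n * sizeof(float));
        });
    }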
diff --git a/arm_compute/core/NEON/kernels/assembly/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index 717506f54c..c80bb59941 100644
--- a/arm_compute/core/NEON/kernels/assembly/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,51 +35,63 @@
* being a multiple of the block sizes.
*/
template <unsigned IntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize>
-struct TransformImpl {
+struct TransformImpl
+{
template <typename TOut, typename TIn>
- static void Transform(TOut* out, const TIn* const in, const int stride,
- const int y0, const int ymax, const int x0, const int xmax) {
+ static void Transform(TOut *out, const TIn *const in, const int stride,
+ const int y0, const int ymax, const int x0, const int xmax)
+ {
const int n_whole_y_blocks = (ymax - y0) / IntBy;
- const int y_remainders = (ymax - y0) % IntBy;
- const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
+ const int y_remainders = (ymax - y0) % IntBy;
+ const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
const int n_whole_x_blocks = (xmax - x0) / BlockBy;
- const int x_remainders = (xmax - x0) % BlockBy;
- const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
+ const int x_remainders = (xmax - x0) % BlockBy;
+ const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
// "Y" loop: advance down the rows of the source IntBy rows at a time.
// Set up fill_rows to show the number rows to copy from, and blank_rows
// for the number of blank rows to add.
- for (int y_block=0 ; y_block < n_y_blocks; y_block++) {
- int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
+ for(int y_block = 0; y_block < n_y_blocks; y_block++)
+ {
+ int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
int blank_rows = IntBy - fill_rows;
int y_base = y0 + (y_block * IntBy);
// So now advance along this block of rows, BlockBy columns at a time.
- for (int x_block=0 ; x_block < n_x_blocks; x_block++) {
- int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
+ for(int x_block = 0; x_block < n_x_blocks; x_block++)
+ {
+ int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
int blank_cols = BlockBy - fill_cols;
int x_base = x0 + (x_block * BlockBy);
- for (int row = 0; row < fill_rows; row++) {
- for (int col = 0; col < fill_cols; col++) {
+ for(int row = 0; row < fill_rows; row++)
+ {
+ for(int col = 0; col < fill_cols; col++)
+ {
// In-range copy. If it's transposed, we reverse the sense of rows and columns here.
- if (Transposed) {
+ if(Transposed)
+ {
*out++ = static_cast<TOut>(in[(x_base + col) * stride + y_base + row]);
- } else {
+ }
+ else
+ {
*out++ = static_cast<TOut>(in[(y_base + row) * stride + x_base + col]);
}
}
// "col" tail - row is in range but column is out of range.
- for (int col=0; col < blank_cols; col++) {
+ for(int col = 0; col < blank_cols; col++)
+ {
*out++ = static_cast<TOut>(0);
}
}
// "row" tail - row is out of range so fill with zeros always.
- for (int row = 0; row < blank_rows; row++) {
- for (int col=0; col < (fill_cols + blank_cols); col++) {
+ for(int row = 0; row < blank_rows; row++)
+ {
+ for(int col = 0; col < (fill_cols + blank_cols); col++)
+ {
*out++ = static_cast<TOut>(0);
}
}
@@ -88,8 +100,9 @@ struct TransformImpl {
}
template <typename T>
- static inline void Transform(T* out, const T* const in, const int stride,
- const int k0, const int kmax, const int x0, const int xmax) {
+ static inline void Transform(T *out, const T *const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax)
+ {
Transform<T, T>(out, in, stride, k0, kmax, x0, xmax);
}
};
@@ -97,13 +110,12 @@ struct TransformImpl {
/*****************************************************************************/
template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, typename TOut, typename TIn>
void Transform(
- TOut* out, const TIn* const in, const int stride,
- const int k0, const int kmax, const int x0, const int xmax
-) {
- // Redirect to a specialised implementation predicated on argument size.
- TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
- out, in, stride, k0, kmax, x0, xmax
- );
+ TOut *out, const TIn *const in, const int stride,
+ const int k0, const int kmax, const int x0, const int xmax)
+{
+ // Redirect to a specialised implementation predicated on argument size.
+ TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn)>::Transform(
+ out, in, stride, k0, kmax, x0, xmax);
}
/*****************************************************************************/
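The generic TransformImpl above walks the source in IntBy-row by BlockBy-column blocks and zero-pads whatever falls outside [y0, ymax) x [x0, xmax). The standalone sketch below reproduces that rule for the simple case IntBy=4, BlockBy=1, non-transposed, on a 3x5 matrix (so the fourth row of every block is padding); it is an illustration, not the real template:

    #include <cstdio>
    #include <vector>

    int main()
    {
        // 3x5 row-major source packed as 4-row column slices: each column emits
        // four values, the missing fourth row being filled with zeros.
        const int rows = 3, cols = 5, IntBy = 4;
        std::vector<int> in(rows * cols);
        for(int i = 0; i < rows * cols; i++)
        {
            in[i] = i + 1;
        }

        std::vector<int> out;
        for(int y = 0; y < rows; y += IntBy)   // "Y" blocks of IntBy rows
        {
            for(int x = 0; x < cols; x++)      // advance along the columns
            {
                for(int r = 0; r < IntBy; r++) // IntBy values per column
                {
                    out.push_back((y + r) < rows ? in[(y + r) * cols + x] : 0);
                }
            }
        }

        for(int i = 0; i < (int)out.size(); i++)
        {
            std::printf("%2d%c", out[i], ((i + 1) % IntBy) ? ' ' : '\n');
        }
        return 0;
    }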
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
index 4a1b5d2bf2..f09e5a0e78 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a32_interleave_6way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -26,17 +26,20 @@
#ifdef __arm__
#include <arm_neon.h>
-#include "asmlib.hpp"
-template<>
-template<typename T>
-inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
- const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
+ const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
uint32_t zerobuff[8];
- for (int y=y0; y<ymax; y+=6) {
+ for(int y = y0; y < ymax; y += 6)
+ {
const uint32_t *inptr0 = inptr + y * ldin + k0;
const uint32_t *inptr1 = inptr0 + ldin;
const uint32_t *inptr2 = inptr1 + ldin;
@@ -51,11 +54,14 @@ inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int
//prefetch_2x(inptr4);
//prefetch_2x(inptr5);
- int x=(kmax-k0);
- for (;x>7;x-=8) {
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
/* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 5) >= ymax) {
- switch ((y + 5) - ymax) {
+ if((y + 5) >= ymax)
+ {
+ switch((y + 5) - ymax)
+ {
/* Everything falls through in here */
case 4:
inptr1 = zerobuff;
@@ -72,73 +78,67 @@ inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int
}
}
-
- __asm __volatile (
+ __asm __volatile(
// Load up 8 elements (2 vectors) from each of 8 sources.
- "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
- "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
- "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
- "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
- "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
- "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
- "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
- "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
- "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+ "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
+ "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
+ "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
+ "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
+ "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+ "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
+ "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
+ "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
+ "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
ASM_PREFETCH("[%[inptr0], #128]")
- "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+ "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
// Store first elements
- "VST1.32 {d0-d1}, [%[outptr]]!\n"
- "VST1.32 {d16}, [%[outptr]]!\n"
+ "VST1.32 {d0-d1}, [%[outptr]]!\n"
+ "VST1.32 {d16}, [%[outptr]]!\n"
- "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+ "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
// Store second elements
- "VST1.32 {d4-d5}, [%[outptr]]!\n"
- "VZIP.32 q1, q5\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "VST1.32 {d17}, [%[outptr]]!\n"
- "VZIP.32 q3, q7\n"
+ "VST1.32 {d4-d5}, [%[outptr]]!\n"
+ "VZIP.32 q1, q5\n" ASM_PREFETCH("[%[inptr1], #128]")
+ "VST1.32 {d17}, [%[outptr]]!\n"
+ "VZIP.32 q3, q7\n"
// Store third elements
- "VZIP.32 q9, q11\n"
- "VST1.32 {d8-d9}, [%[outptr]]!\n"
- "VZIP.32 q1, q3\n"
- ASM_PREFETCH("[%[inptr2], #128]")
- "VST1.32 {d20}, [%[outptr]]!\n"
+ "VZIP.32 q9, q11\n"
+ "VST1.32 {d8-d9}, [%[outptr]]!\n"
+ "VZIP.32 q1, q3\n" ASM_PREFETCH("[%[inptr2], #128]")
+ "VST1.32 {d20}, [%[outptr]]!\n"
// Store fourth elements
- "VZIP.32 q5, q7\n"
- "VST1.32 {d12-d13}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr3], #128]")
- "VST1.32 {d21}, [%[outptr]]!\n"
+ "VZIP.32 q5, q7\n"
+ "VST1.32 {d12-d13}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr3], #128]")
+ "VST1.32 {d21}, [%[outptr]]!\n"
// Fifth
- "VST1.32 {d2-d3}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "VST1.32 {d18}, [%[outptr]]!\n"
+ "VST1.32 {d2-d3}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr4], #128]")
+ "VST1.32 {d18}, [%[outptr]]!\n"
// Sixth
- "VST1.32 {d6-d7}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "VST1.32 {d19}, [%[outptr]]!\n"
+ "VST1.32 {d6-d7}, [%[outptr]]!\n" ASM_PREFETCH("[%[inptr5], #128]")
+ "VST1.32 {d19}, [%[outptr]]!\n"
// Seventh
- "VST1.32 {d10-d11}, [%[outptr]]!\n"
- "VST1.32 {d22}, [%[outptr]]!\n"
+ "VST1.32 {d10-d11}, [%[outptr]]!\n"
+ "VST1.32 {d22}, [%[outptr]]!\n"
- // Eigth
- "VST1.32 {d14-d15}, [%[outptr]]!\n"
- "VST1.32 {d23}, [%[outptr]]!\n"
+ // Eighth
+ "VST1.32 {d14-d15}, [%[outptr]]!\n"
+ "VST1.32 {d23}, [%[outptr]]!\n"
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [outptr] "+r"(outptr)
:
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
- );
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12");
}
- for (;x>0;x--) {
+ for(; x > 0; x--)
+ {
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
@@ -149,4 +149,4 @@ inline void TransformImpl<6, 1, false, 4, 4>::Transform(T *out, const T *in, int
}
}
-#endif // __arm__
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
new file mode 100644
index 0000000000..ea32c9665c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 8x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, true, 4, 4>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a 16x uint16_t specialisation
+ TransformImpl<16, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 12x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 16 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+ __asm volatile(
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ : [in0] "+r"(in0),
+ [out] "+r"(out)
+ :
+ : "q0", "q1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+ __asm volatile(
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in1], #192]") "SUB %[out], %[out], #32\n"
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [out] "+r"(out)
+ :
+ : "q0", "q1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+ __asm __volatile(
+ "VLD1.32 {d0-d3}, [%[in0]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in0], #192]")
+ "VLD1.32 {d0-d3}, [%[in1]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in1], #192]")
+ "VLD1.32 {d0-d3}, [%[in2]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]!\n" ASM_PREFETCH("[%[in2], #192]")
+ "VLD1.32 {d0-d3}, [%[in3]]!\n"
+ "VST1.32 {d0-d3}, [%[out]]\n" ASM_PREFETCH("[%[in3], #192]") "SUB %[out], %[out], #96\n"
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [in2] "+r"(in2),
+ [in3] "+r"(in3),
+ [out] "+r"(out)
+ :
+ : "q0", "q1", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<16, 1, true, 2, 2>::Transform(
+ uint16_t *out, const uint16_t *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<16, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __arm__
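The 8x32-bit transposed transform above can forward to the 16x16-bit one because every 32-bit element is just two adjacent 16-bit halves: doubling the stride and the x-range makes the 16-bit kernel traverse exactly the same bytes. A tiny, purely illustrative demonstration of that reinterpretation:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        // Element x of the 32-bit view covers elements 2*x and 2*x+1 of the
        // 16-bit view, which is why the redirection multiplies stride, x0 and
        // xmax by two.
        uint32_t row32[4] = { 0x11112222u, 0x33334444u, 0x55556666u, 0x77778888u };
        uint16_t row16[8];
        std::memcpy(row16, row32, sizeof(row32));

        for(int x = 0; x < 4; x++)
        {
            std::printf("32-bit[%d]=0x%08x -> 16-bit[%d]=0x%04x, 16-bit[%d]=0x%04x\n",
                        x, (unsigned)row32[x],
                        2 * x, (unsigned)row16[2 * x],
                        2 * x + 1, (unsigned)row16[2 * x + 1]);
        }
        return 0;
    }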
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
index ac84567b54..8d61f15cec 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_block16_interleave4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -26,17 +26,21 @@
#ifdef __aarch64__
#include <arm_neon.h>
-#include "asmlib.hpp"
-template<>
-template<typename T>
-inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint8_t *outptr = (uint8_t *)out;
- const uint8_t *inptr = (uint8_t *)in;
+#include "../asmlib.hpp"
+#include "../utils.hpp"
+
+template <>
+template <typename T>
+void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint8_t *outptr = (uint8_t *)out;
+ const uint8_t *inptr = (uint8_t *)in;
uint8_t zerobuff[16];
- for (int y=y0; y<ymax; y+=4) {
+ for(int y = y0; y < ymax; y += 4)
+ {
const uint8_t *inptr0 = inptr + y * ldin + k0;
const uint8_t *inptr1 = inptr0 + ldin;
const uint8_t *inptr2 = inptr1 + ldin;
@@ -47,11 +51,14 @@ inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, in
prefetch_2x(inptr2);
prefetch_2x(inptr3);
- int x=(kmax-k0);
- for (;x>15;x-=16) {
+ int x = (kmax - k0);
+ for(; x > 15; x -= 16)
+ {
/* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 3) >= ymax) {
- switch ((y + 3) - ymax) {
+ if((y + 3) >= ymax)
+ {
+ switch((y + 3) - ymax)
+ {
/* Everything falls through in here */
case 2:
inptr1 = zerobuff;
@@ -59,33 +66,30 @@ inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, in
inptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
- default:
break;
+
+ default:
+ UNREACHABLE("Impossible.");
}
}
- __asm __volatile (
- "LDR q0, [%[inptr0]], #16\n"
- ASM_PREFETCH("[%[inptr0], #176]")
- "LDR q1, [%[inptr1]], #16\n"
- ASM_PREFETCH("[%[inptr1], #176]")
- "STP q0, q1, [%[outptr]], #32\n"
- "LDR q0, [%[inptr2]], #16\n"
- ASM_PREFETCH("[%[inptr2], #176]")
- "LDR q1, [%[inptr3]], #16\n"
- ASM_PREFETCH("[%[inptr3], #176]")
- "STP q0, q1, [%[outptr]], #32\n"
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [outptr] "+r" (outptr)
+ __asm __volatile(
+ "LDR q0, [%[inptr0]], #16\n" ASM_PREFETCH("[%[inptr0], #176]") "LDR q1, [%[inptr1]], #16\n" ASM_PREFETCH("[%[inptr1], #176]")
+ "STP q0, q1, [%[outptr]], #32\n"
+ "LDR q0, [%[inptr2]], #16\n" ASM_PREFETCH("[%[inptr2], #176]") "LDR q1, [%[inptr3]], #16\n" ASM_PREFETCH("[%[inptr3], #176]") "STP q0, q1, [%[outptr]], #32\n"
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [outptr] "+r"(outptr)
:
- : "v0", "v1"
- );
+ : "v0", "v1");
}
- if (x>0) {
+ if(x > 0)
+ {
/* Need to duplicate this here, in case we didn't run the main loop. */
- if ((y + 3) >= ymax) {
- switch ((y + 3) - ymax) {
+ if((y + 3) >= ymax)
+ {
+ switch((y + 3) - ymax)
+ {
/* Everything falls through in here */
case 2:
inptr1 = zerobuff;
@@ -93,17 +97,24 @@ inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, in
inptr2 = zerobuff;
case 0:
inptr3 = zerobuff;
- default:
break;
+
+ default:
+ UNREACHABLE("Impossible.");
}
}
/* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
- auto f = [&outptr, x](const uint8_t *&p) {
- for (int i=0; i<16; i++) {
- if (i < x) {
+ auto f = [&outptr, x](const uint8_t *&p)
+ {
+ for(int i = 0; i < 16; i++)
+ {
+ if(i < x)
+ {
*outptr++ = *p++;
- } else {
+ }
+ else
+ {
*outptr++ = 0;
}
}
@@ -117,4 +128,4 @@ inline void TransformImpl<4, 16, false, 1, 1>::Transform(T *out, const T *in, in
}
}
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
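Scalar reference for the transform above: four source rows are interleaved sixteen bytes at a time, and anything past ymax or kmax is emitted as padding (the assembly achieves the same effect by redirecting exhausted rows to a dummy buffer). The helper name below is illustrative.

    #include <cstdint>

    // Scalar equivalent of TransformImpl<4, 16, false, 1, 1>::Transform: emit
    // 16-byte chunks from each of four rows in turn, zero-padding ragged edges.
    inline void interleave4_block16_reference(uint8_t *out, const uint8_t *in, int ldin,
                                              int y0, int ymax, int k0, int kmax)
    {
        for(int y = y0; y < ymax; y += 4)
        {
            for(int k = k0; k < kmax; k += 16)
            {
                for(int r = 0; r < 4; r++)
                {
                    for(int i = 0; i < 16; i++)
                    {
                        bool in_range = ((y + r) < ymax) && ((k + i) < kmax);
                        *out++ = in_range ? in[(y + r) * ldin + (k + i)] : 0;
                    }
                }
            }
        }
    }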
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
new file mode 100644
index 0000000000..3cbc8815e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint16_t *outptr = (uint16_t *)out;
+ const uint16_t *inptr = (const uint16_t *)in;
+
+ uint16_t zerobuff[24];
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ const uint16_t *inptr0 = inptr + y * ldin + k0;
+ const uint16_t *inptr1 = inptr0 + ldin;
+ const uint16_t *inptr2 = inptr1 + ldin;
+ const uint16_t *inptr3 = inptr2 + ldin;
+ const uint16_t *inptr4 = inptr3 + ldin;
+ const uint16_t *inptr5 = inptr4 + ldin;
+ const uint16_t *inptr6 = inptr5 + ldin;
+ const uint16_t *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ int skippf = (x & 31);
+ __asm __volatile(
+ // Load up 8 elements (1 vector) from each of 8 sources.
+ "CBNZ %w[skippf], 1f\n" ASM_PREFETCH("[%[inptr0], #128]")
+ ASM_PREFETCH("[%[inptr1], #128]")
+ ASM_PREFETCH("[%[inptr2], #128]")
+ ASM_PREFETCH("[%[inptr3], #128]")
+ "1:\n"
+
+ "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
+ "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
+ "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
+ "LDR q6, [%[inptr6]], #16\n"
+ "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
+ "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
+ "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
+ "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
+ "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
+ "LDR q5, [%[inptr5]], #16\n"
+ "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
+ "LDR q7, [%[inptr7]], #16\n"
+ "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
+ "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
+ "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
+ "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
+
+ "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1
+ "ZIP2 v20.8h, v8.8h, v9.8h\n"
+ "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
+ "ZIP2 v21.8h, v10.8h, v11.8h\n"
+
+ "CBNZ %w[skippf], 2f\n" ASM_PREFETCH("[%[inptr4], #112]")
+ ASM_PREFETCH("[%[inptr5], #112]")
+ ASM_PREFETCH("[%[inptr6], #112]")
+ ASM_PREFETCH("[%[inptr7], #112]")
+ "2:\n"
+
+ "ZIP1 v22.8h, v16.8h, v17.8h\n"
+ "ZIP2 v30.8h, v16.8h, v17.8h\n"
+ "ZIP1 v23.8h, v18.8h, v19.8h\n"
+ "ZIP2 v31.8h, v18.8h, v19.8h\n"
+
+ "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0
+ "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1
+ "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements
+
+ "ZIP1 v0.8h, v20.8h, v21.8h\n"
+ "ZIP2 v1.8h, v20.8h, v21.8h\n"
+ "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements
+
+ "ZIP1 v2.8h, v22.8h, v23.8h\n"
+ "ZIP2 v3.8h, v22.8h, v23.8h\n"
+ "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements
+
+ "ZIP1 v4.8h, v30.8h, v31.8h\n"
+ "ZIP2 v5.8h, v30.8h, v31.8h\n"
+ "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+ : [skippf] "r"(skippf)
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+ "v25", "v26", "v27", "v28", "v29", "v30", "v31");
+ }
+
+ for(; x > 0; x--)
+ {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__
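
The ZIP1/ZIP2 ladder above is effectively an 8x8 transpose in registers: for every column k it emits one 16-bit element from each of the eight source rows. A scalar sketch of that ordering follows (an illustrative reference, not part of the patch; the zero for rows past ymax reflects the intent of the zerobuff redirection):

#include <cstdint>

void interleave_8way_16bit_ref(uint16_t *out, const uint16_t *in, int ldin,
                               int y0, int ymax, int k0, int kmax)
{
    for(int y = y0; y < ymax; y += 8)       // 8 rows at a time
    {
        for(int k = k0; k < kmax; k++)      // one column at a time
        {
            for(int r = 0; r < 8; r++)      // emit A[k], B[k], ..., H[k]
            {
                *out++ = ((y + r) < ymax) ? in[(y + r) * ldin + k] : 0;
            }
        }
    }
}
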
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index bd5125afab..47e4fa2608 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -26,17 +26,20 @@
#ifdef __aarch64__
#include <arm_neon.h>
-#include "asmlib.hpp"
-template<>
-template<typename T>
-inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint32_t *outptr = (uint32_t *)out;
- const uint32_t *inptr = (uint32_t *)in;
+#include "../asmlib.hpp"
+
+template <>
+template <typename T>
+inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ uint32_t *outptr = (uint32_t *)out;
+ const uint32_t *inptr = (uint32_t *)in;
uint32_t zerobuff[8];
- for (int y=y0; y<ymax; y+=8) {
+ for(int y = y0; y < ymax; y += 8)
+ {
const uint32_t *inptr0 = inptr + y * ldin + k0;
const uint32_t *inptr1 = inptr0 + ldin;
const uint32_t *inptr2 = inptr1 + ldin;
@@ -55,11 +58,14 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
prefetch_2x(inptr6);
prefetch_2x(inptr7);
- int x=(kmax-k0);
- for (;x>7;x-=8) {
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
/* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
/* Everything falls through in here */
case 6:
inptr1 = zerobuff;
@@ -75,25 +81,26 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
inptr6 = zerobuff;
case 0:
inptr7 = zerobuff;
- default:
break;
+
+ default:
+ UNREACHABLE("Impossible.");
}
}
- __asm __volatile (
+ __asm __volatile(
// Load up 8 elements (2 vectors) from each of 8 sources.
"LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
"LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
"LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
- "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+ "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
ASM_PREFETCH("[%[inptr0], #128]")
"LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
- "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+ "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
"LDP q8, q9, [%[inptr4]], #32\n"
"LDP q10, q11, [%[inptr5]], #32\n"
"LDP q12, q13, [%[inptr6]], #32\n"
- "ZIP1 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr1], #128]")
+ "ZIP1 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr1], #128]")
"LDP q14, q15, [%[inptr7]], #32\n"
"ZIP1 v19.4s, v10.4s, v14.4s\n"
@@ -103,8 +110,7 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
"ZIP2 v22.4s, v16.4s, v17.4s\n"
"ZIP2 v23.4s, v18.4s, v19.4s\n"
- "ZIP2 v16.4s, v0.4s, v4.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
+ "ZIP2 v16.4s, v0.4s, v4.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
"ZIP2 v17.4s, v2.4s, v6.4s\n"
"STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
@@ -112,14 +118,12 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
"ZIP2 v19.4s, v10.4s, v14.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP1 v20.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
"ZIP1 v21.4s, v18.4s, v19.4s\n"
"ZIP2 v22.4s, v16.4s, v17.4s\n"
"ZIP2 v23.4s, v18.4s, v19.4s\n"
- "ZIP1 v16.4s, v1.4s, v5.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP1 v16.4s, v1.4s, v5.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
"ZIP1 v17.4s, v3.4s, v7.4s\n"
"STP q20, q21, [%[outptr]], #32\n" // Third element
@@ -129,16 +133,14 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
"ZIP1 v20.4s, v16.4s, v17.4s\n"
"ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
+ "ZIP2 v22.4s, v16.4s, v17.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
"ZIP2 v23.4s, v18.4s, v19.4s\n"
"ZIP2 v16.4s, v1.4s, v5.4s\n"
"ZIP2 v17.4s, v3.4s, v7.4s\n"
"STP q20, q21, [%[outptr]], #32\n" // Fifth element
- "ZIP2 v18.4s, v9.4s, v13.4s\n"
- ASM_PREFETCH("[%[inptr7], #128]")
+ "ZIP2 v18.4s, v9.4s, v13.4s\n" ASM_PREFETCH("[%[inptr7], #128]")
"ZIP2 v19.4s, v11.4s, v15.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Sixth element
@@ -149,15 +151,15 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
"ZIP2 v22.4s, v16.4s, v17.4s\n"
"ZIP2 v23.4s, v18.4s, v19.4s\n"
"STP q22, q23, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
- );
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
- for (;x>0;x--) {
+ for(; x > 0; x--)
+ {
*outptr++ = *inptr0++;
*outptr++ = *inptr1++;
*outptr++ = *inptr2++;
@@ -170,4 +172,4 @@ inline void TransformImpl<8, 1, false, 4, 4>::Transform(T *out, const T *in, int
}
}
-#endif // __aarch64__
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
new file mode 100644
index 0000000000..85ffdc2d4f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template <>
+template <>
+inline void TransformImpl<8, 1, false, 4, 2>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+ float *outptr = out;
+ const __fp16 *inptr = in;
+
+ __fp16 zerobuff[8];
+
+ for(int y = y0; y < ymax; y += 8)
+ {
+ const __fp16 *inptr0 = inptr + y * ldin + k0;
+ const __fp16 *inptr1 = inptr0 + ldin;
+ const __fp16 *inptr2 = inptr1 + ldin;
+ const __fp16 *inptr3 = inptr2 + ldin;
+ const __fp16 *inptr4 = inptr3 + ldin;
+ const __fp16 *inptr5 = inptr4 + ldin;
+ const __fp16 *inptr6 = inptr5 + ldin;
+ const __fp16 *inptr7 = inptr6 + ldin;
+
+ prefetch_2x(inptr0);
+ prefetch_2x(inptr1);
+ prefetch_2x(inptr2);
+ prefetch_2x(inptr3);
+ prefetch_2x(inptr4);
+ prefetch_2x(inptr5);
+ prefetch_2x(inptr6);
+ prefetch_2x(inptr7);
+
+ int x = (kmax - k0);
+ for(; x > 7; x -= 8)
+ {
+ /* Cope with ragged cases by copying from a buffer of zeroes instead */
+ if((y + 7) >= ymax)
+ {
+ switch((y + 7) - ymax)
+ {
+ /* Everything falls through in here */
+ case 6:
+ inptr1 = zerobuff;
+ case 5:
+ inptr2 = zerobuff;
+ case 4:
+ inptr3 = zerobuff;
+ case 3:
+ inptr4 = zerobuff;
+ case 2:
+ inptr5 = zerobuff;
+ case 1:
+ inptr6 = zerobuff;
+ case 0:
+ inptr7 = zerobuff;
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+ }
+
+ __asm __volatile(
+ // Load up 8 elements (2 vectors) from each of 8 sources.
+ "LDR q0, [%[inptr0]], #16\n"
+ "LDR q2, [%[inptr1]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+ "FCVTL2 v3.4s, v2.8h\n"
+ "FCVTL v2.4s, v2.4h\n"
+ "FCVTL2 v5.4s, v4.8h\n"
+ "FCVTL v4.4s, v4.4h\n"
+ "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+ "FCVTL2 v7.4s, v6.8h\n"
+ "FCVTL v6.4s, v6.4h\n"
+ "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
+ "LDR q8, [%[inptr4]], #16\n"
+ "LDR q10, [%[inptr5]], #16\n"
+ "FCVTL2 v9.4s, v8.8h\n"
+ "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[inptr1], #128]")
+ "LDR q12, [%[inptr6]], #16\n"
+ "FCVTL2 v11.4s, v10.8h\n"
+ "FCVTL v10.4s, v10.4h\n"
+ "FCVTL2 v13.4s, v12.8h\n"
+ "FCVTL v12.4s, v12.4h\n"
+ "ZIP1 v18.4s, v8.4s, v12.4s\n"
+ "LDR q14, [%[inptr7]], #16\n"
+ "FCVTL2 v15.4s, v14.8h\n"
+ "FCVTL v14.4s, v14.4h\n"
+ "ZIP1 v19.4s, v10.4s, v14.4s\n"
+
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr3], #128]")
+
+ "ZIP2 v16.4s, v0.4s, v4.4s\n"
+ "ZIP2 v17.4s, v2.4s, v6.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
+
+ "ZIP2 v18.4s, v8.4s, v12.4s\n" ASM_PREFETCH("[%[inptr4], #128]")
+ "ZIP2 v19.4s, v10.4s, v14.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n" ASM_PREFETCH("[%[inptr5], #128]")
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP1 v16.4s, v1.4s, v5.4s\n"
+ "ZIP1 v17.4s, v3.4s, v7.4s\n" ASM_PREFETCH("[%[inptr6], #128]")
+ "STP q20, q21, [%[outptr]], #32\n" // Third element
+
+ "ZIP1 v18.4s, v9.4s, v13.4s\n"
+ "ZIP1 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Fourth element
+ ASM_PREFETCH("[%[inptr7], #128]")
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+
+ "ZIP2 v16.4s, v1.4s, v5.4s\n"
+ "ZIP2 v17.4s, v3.4s, v7.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Fifth element
+
+ "ZIP2 v18.4s, v9.4s, v13.4s\n"
+ "ZIP2 v19.4s, v11.4s, v15.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Sixth element
+
+ "ZIP1 v20.4s, v16.4s, v17.4s\n"
+ "ZIP1 v21.4s, v18.4s, v19.4s\n"
+ "STP q20, q21, [%[outptr]], #32\n" // Seventh element
+
+ "ZIP2 v22.4s, v16.4s, v17.4s\n"
+ "ZIP2 v23.4s, v18.4s, v19.4s\n"
+ "STP q22, q23, [%[outptr]], #32\n" // Eighth element
+ : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
+ [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
+ }
+
+ for(; x > 0; x--)
+ {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ *outptr++ = *inptr6++;
+ *outptr++ = *inptr7++;
+ }
+ }
+}
+
+#endif // __aarch64__
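
Each FCVTL/FCVTL2 pair above widens one 8-lane __fp16 load into two float32x4_t vectors before the same ZIP interleave is applied. An equivalent intrinsics sketch, assuming a toolchain with Arm FP16 support as the __ARM_FEATURE_FP16_SCALAR_ARITHMETIC guard above implies (the helper name is illustrative, not from the patch):

#include <arm_neon.h>

static inline void widen_fp16x8_to_f32(const __fp16 *src, float32x4_t &lo, float32x4_t &hi)
{
    const float16x8_t h = vld1q_f16(src);   // 8 half-precision lanes (__fp16 is float16_t here)
    lo = vcvt_f32_f16(vget_low_f16(h));     // lanes 0..3  (FCVTL)
    hi = vcvt_high_f32_f16(h);              // lanes 4..7  (FCVTL2)
}
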
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
new file mode 100644
index 0000000000..fd6a253c6a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include "transpose_interleave_common.hpp"
+
+// Generic unblocked transposed 6x32-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<6, 1, true, 4, 4>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a 12 x uint16_t specialisation
+ TransformImpl<12, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride * 2, x0 * 2, xmax * 2, k0, kmax);
+}
+
+// Generic 12x16-bit sized specialisation
+template <>
+template <typename T>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride, x0, xmax, k0, kmax);
+}
+
+// Specialised 12 x uint16_t version
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+ __asm volatile(
+ "LDR q0, [%[in0]]\n"
+ "STR q0, [%[out]]\n"
+ "LDR d1, [%[in0], #0x10]\n"
+ "STR d1, [%[out], #0x10]\n"
+ "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+ : [in0] "+r"(in0),
+ [out] "+r"(out)
+ :
+ : "v0", "v1", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+ __asm volatile(
+ "LDR q0, [%[in0]]\n"
+ "LDR d1, [%[in0], #0x10]\n"
+ "ADD %x[in0], %x[in0], #0x18\n" ASM_PREFETCH("[%[in0], #192]")
+
+ "LDR x21, [%[in1]]\n"
+ "LDR q2, [%[in1], #0x08]\n"
+ "INS v1.d[1], x21\n"
+ "ADD %x[in1], %x[in1], #0x18\n"
+ "STP q0, q1, [%[out]]\n"
+ "STR q2, [%x[out], #0x20]\n" ASM_PREFETCH("[%[in1], #192]")
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [out] "+r"(out)
+ :
+ : "x21", "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+ __asm __volatile(
+ "LDR q0, [%x[in0]], #0x10\n"
+ "STR q0, [%x[out]]\n"
+ "LDR d1, [%x[in0]], #0x08\n" ASM_PREFETCH("[%[in0], #192]")
+ "STR d1, [%x[out], #0x10]\n"
+
+ "LDR q0, [%x[in1]], #0x10\n"
+ "STR q0, [%x[out], #0x18]\n"
+ "LDR d1, [%x[in1]], #0x08\n" ASM_PREFETCH("[%[in1], #192]")
+ "STR d1, [%x[out], #0x28]\n"
+
+ "LDR q0, [%x[in2]], #0x10\n"
+ "STR q0, [%x[out], #0x30]\n"
+ "LDR d1, [%x[in2]], #0x08\n" ASM_PREFETCH("[%[in2], #192]")
+ "STR d1, [%x[out], #0x40]\n"
+
+ "LDR q0, [%x[in3]], #0x10\n"
+ "STR q0, [%x[out], #0x48]\n"
+ "LDR d1, [%x[in3]], #0x08\n" ASM_PREFETCH("[%[in3], #192]") "STR d1, [%x[out], #0x58]\n"
+ : [in0] "+r"(in0),
+ [in1] "+r"(in1),
+ [in2] "+r"(in2),
+ [in3] "+r"(in3),
+ [out] "+r"(out)
+ :
+ : "v0", "v1", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 2, 2>::Transform(
+ uint16_t *out, const uint16_t *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<12, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
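
The 6x32-bit specialisation above forwards to the 12x16-bit kernel purely by reinterpretation: a row of 6 uint32_t values occupies the same bytes as a row of 12 uint16_t values, so only the x-dimension quantities (stride, x0, xmax) are doubled while the k range stays unchanged. A small standalone illustration of that mapping, assuming a little-endian AArch64 target (not part of the patch):

#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
    const uint32_t row32[2] = { 0x11112222u, 0x33334444u };
    uint16_t       row16[4];
    std::memcpy(row16, row32, sizeof(row32));   // same bytes, reinterpreted

    // 32-bit element x maps to 16-bit elements 2*x and 2*x + 1 (little-endian).
    assert(row16[0] == 0x2222 && row16[1] == 0x1111);
    assert(row16[2] == 0x4444 && row16[3] == 0x3333);
    return 0;
}
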
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
new file mode 100644
index 0000000000..ff1cbfb5f5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
+
+#include "transpose_interleave_common.hpp"
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x1(const __fp16 *&in0, float *out)
+{
+ __asm __volatile(
+ "LDR q0, [%[in0]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDR d2, [%[in0]], #8\n"
+ "FCVTL v2.4s, v2.4h\n"
+ "STR q2, [%[out], #32]\n"
+ : [in0] "+r"(in0), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x2(const __fp16 *&in0, const __fp16 *&in1, float *out)
+{
+ __asm __volatile(
+ "LDR q0, [%[in0]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDR d2, [%[in0]], #8\n"
+ "FCVTL v2.4s, v2.4h\n"
+ "LDR q3, [%[in1]], #16\n"
+ "FCVTL2 v4.4s, v3.8h\n"
+ "FCVTL v3.4s, v3.4h\n"
+ "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+        "LDR d5, [%[in1]], #8\n"
+ "FCVTL v5.4s, v5.4h\n"
+ "STP q4, q5, [%[out], #64]\n"
+ : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
+}
+
+template <>
+inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __fp16 *&in0, const __fp16 *&in1, const __fp16 *&in2, const __fp16 *&in3, float *out)
+{
+ __asm __volatile(
+ "LDR q0, [%[in0]], #16\n"
+ "FCVTL2 v1.4s, v0.8h\n"
+ "FCVTL v0.4s, v0.4h\n"
+ "STP q0, q1, [%[out]]\n"
+ "LDR d2, [%[in0]], #8\n" ASM_PREFETCH("[%[in0], #192]")
+ "FCVTL v2.4s, v2.4h\n"
+ "LDR q3, [%[in1]], #16\n"
+ "FCVTL2 v4.4s, v3.8h\n"
+ "FCVTL v3.4s, v3.4h\n"
+ "STP q2, q3, [%[out], #32]\n"
+ "LDR d5, [%[in1]], #8\n"
+ "FCVTL v5.4s, v5.4h\n" ASM_PREFETCH("[%[in1], #192]")
+ "STP q4, q5, [%[out], #64]\n"
+ "LDR q6, [%[in2]], #16\n"
+ "FCVTL2 v7.4s, v6.8h\n"
+ "FCVTL v6.4s, v6.4h\n"
+ "STP q6, q7, [%[out], #96]\n"
+ "LDR d8, [%[in2]], #8\n"
+ "FCVTL v8.4s, v8.4h\n" ASM_PREFETCH("[%[in2], #192]")
+ "LDR q9, [%[in3]], #16\n"
+ "FCVTL2 v10.4s, v9.8h\n"
+ "FCVTL v9.4s, v9.4h\n"
+ "STP q8, q9, [%[out], #128]\n"
+ "LDR d11, [%[in3]], #8\n"
+ "FCVTL v11.4s, v11.4h\n"
+ "STP q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
+
+ : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
+}
+
+template <>
+template <>
+inline void TransformImpl<12, 1, true, 4, 2>::Transform(
+ float *out, const __fp16 *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<12, __fp16, float>::Transform(out, in, stride, x0, xmax, k0, kmax);
+}
+
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index b6565baa23..5434599f03 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -31,100 +31,91 @@
template <>
template <typename T>
inline void TransformImpl<12, 1, true, 4, 4>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a 24 x uint16_t specialisation
- TransformImpl<24, 1, true, 2, 2>::Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride*2, x0*2, xmax*2, k0, kmax
- );
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a 24 x uint16_t specialisation
+ TransformImpl<24, 1, true, 2, 2>::Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride * 2, x0 * 2, xmax * 2, k0, kmax);
}
// Generic 24x16-bit sized specialisation
template <>
template <typename T>
inline void TransformImpl<24, 1, true, 2, 2>::Transform(
- T* out, const T* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- // Redirect to a uint16_t specialisation
- Transform(
- reinterpret_cast<uint16_t *>(out),
- reinterpret_cast<const uint16_t * const>(in),
- stride, x0, xmax, k0, kmax
- );
+ T *out, const T *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ // Redirect to a uint16_t specialisation
+ Transform(
+ reinterpret_cast<uint16_t *>(out),
+ reinterpret_cast<const uint16_t *const>(in),
+ stride, x0, xmax, k0, kmax);
}
// Specialised 24 x uint16_t version
template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) {
- __asm __volatile (
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out)
+{
+ __asm __volatile(
"LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
"LDR q2, [%[in0]], #16\n"
"STR q2, [%[out], #32]\n"
- : [in0] "+r" (in0), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "memory"
- );
+ : [in0] "+r"(in0), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "memory");
}
template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) {
- __asm __volatile (
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1, uint16_t *out)
+{
+ __asm __volatile(
"LDP q0, q1, [%[in0]], #32\n"
- "STP q0, q1, [%[out]]\n"
- ASM_PREFETCH("[%[in0], #192]")
+ "STP q0, q1, [%[out]]\n" ASM_PREFETCH("[%[in0], #192]")
"LDR q2, [%[in0]], #16\n"
- "LDP q3, q4, [%[in1]], #32\n"
- "STP q2, q3, [%[out], #32]\n"
- ASM_PREFETCH("[%[in1], #192]")
- "LDR q5, [%[in1]], #16\n"
+ "LDP q3, q4, [%[in1]], #32\n"
+ "STP q2, q3, [%[out], #32]\n" ASM_PREFETCH("[%[in1], #192]")
+ "LDR q5, [%[in1]], #16\n"
"STP q4, q5, [%[out], #64]\n"
- : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "memory"
- );
+ : [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "memory");
}
template <>
-inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) {
- __asm __volatile (
+inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out)
+{
+ __asm __volatile(
"LDP q0, q1, [%[in0]], #32\n"
"STP q0, q1, [%[out]]\n"
- "LDR q2, [%[in0]], #16\n"
- ASM_PREFETCH("[%[in0], #192]")
- "LDP q3, q4, [%[in1]], #32\n"
+ "LDR q2, [%[in0]], #16\n" ASM_PREFETCH("[%[in0], #192]")
+ "LDP q3, q4, [%[in1]], #32\n"
"STP q2, q3, [%[out], #32]\n"
- "LDR q5, [%[in1]], #16\n"
- ASM_PREFETCH("[%[in1], #192]")
+ "LDR q5, [%[in1]], #16\n" ASM_PREFETCH("[%[in1], #192]")
"STP q4, q5, [%[out], #64]\n"
- "LDP q6, q7, [%[in2]], #32\n"
+ "LDP q6, q7, [%[in2]], #32\n"
"STP q6, q7, [%[out], #96]\n"
- "LDR q8, [%[in2]], #16\n"
- ASM_PREFETCH("[%[in2], #192]")
- "LDP q9, q10, [%[in3]], #32\n"
+ "LDR q8, [%[in2]], #16\n" ASM_PREFETCH("[%[in2], #192]")
+ "LDP q9, q10, [%[in3]], #32\n"
"STP q8, q9, [%[out], #128]\n"
- "LDR q11, [%[in3]], #16\n"
- "STP q10, q11, [%[out], #160]\n"
- ASM_PREFETCH("[%[in3], #192]")
+ "LDR q11, [%[in3]], #16\n"
+ "STP q10, q11, [%[out], #160]\n" ASM_PREFETCH("[%[in3], #192]")
- : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
+ : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3), [out] "+r"(out)
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory");
}
template <>
template <>
inline void TransformImpl<24, 1, true, 2, 2>::Transform(
- uint16_t* out, const uint16_t* const in, const int stride,
- const int x0, const int xmax, const int k0, const int kmax
-) {
- TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
+ uint16_t *out, const uint16_t *const in, const int stride,
+ const int x0, const int xmax, const int k0, const int kmax)
+{
+ TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, xmax, k0, kmax);
}
-#endif // __arch64__
+#endif // __aarch64__
diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index 8ad5b857fb..8ad5b857fb 100644
--- a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
new file mode 100644
index 0000000000..3218ca1aac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+template <unsigned int IntBy, typename TIn, typename TOut>
+struct TransposeInterleaveCommon
+{
+ // Override the moveblock_1xY methods to improve performance
+ static inline void moveblock_1x1(const TIn *&in0, TOut *out)
+ {
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ }
+
+ static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out)
+ {
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ }
+
+ static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out)
+ {
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in0++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in1++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in2++);
+ }
+ for(unsigned int i = 0; i < IntBy; i++)
+ {
+ *out++ = static_cast<TOut>(*in3++);
+ }
+ }
+
+ static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax)
+ {
+ const auto ldin = stride;
+
+ TOut *outarray = out;
+ const TIn *inarray = in;
+ TOut *outptr_base = outarray;
+ const TIn *inptr_base = inarray + x0 + (k0 * ldin);
+ int ldout = (kmax - k0) * IntBy;
+
+ int k = (kmax - k0);
+ for(; k > 3; k -= 4)
+ {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+ const TIn *inptr3 = inptr2 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+ prefetch_3x(inptr3);
+
+ outptr_base += IntBy * 4;
+ inptr_base += ldin * 4;
+
+ for(int x = (xmax - x0) / IntBy; x > 0; x--)
+ {
+ moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr);
+ outptr += ldout;
+ }
+ }
+
+ if(k)
+ {
+ TOut *outptr = outptr_base;
+ const TIn *inptr = inptr_base;
+ const TIn *inptr1 = inptr + ldin;
+ const TIn *inptr2 = inptr1 + ldin;
+
+ prefetch_3x(inptr);
+ prefetch_3x(inptr1);
+ prefetch_3x(inptr2);
+
+ for(int x = (xmax - x0) / IntBy; x > 0; x--)
+ {
+ switch(k)
+ {
+ case 3:
+ moveblock_1x2(inptr, inptr1, outptr);
+ moveblock_1x1(inptr2, outptr + IntBy * 2);
+ break;
+
+ case 2:
+ moveblock_1x2(inptr, inptr1, outptr);
+ break;
+
+ case 1:
+ moveblock_1x1(inptr, outptr);
+ break;
+
+ default:
+ UNREACHABLE("Impossible.");
+ }
+
+ outptr += ldout;
+ }
+ }
+
+ // Cope with ragged X cases
+ const unsigned int overflow = (xmax - x0) % IntBy;
+ if(overflow)
+ {
+ const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin);
+ TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout;
+
+ for(int k = (kmax - k0); k > 0; k--)
+ {
+ const TIn *inptr = inptr_base;
+ inptr_base += ldin;
+
+ for(unsigned int x = 0; x < IntBy; x++)
+ {
+ TOut val = (x < overflow) ? static_cast<TOut>(*inptr++) : static_cast<TOut>(0);
+ *outptr++ = val;
+ }
+ }
+ }
+ }
+};
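
The Transform above walks the source in groups of four k rows and IntBy-wide column blocks, using the moveblock_1xY fast paths; the resulting layout can be described by a plain scalar loop. An equivalent reference sketch, assuming the same output layout but without the blocking or prefetching (the function name is illustrative):

template <unsigned int IntBy, typename TIn, typename TOut>
void transpose_interleave_ref(TOut *out, const TIn *in, const int stride,
                              const int x0, const int xmax, const int k0, const int kmax)
{
    const int ldout   = (kmax - k0) * IntBy;                 // size of one output column block
    const int nblocks = (xmax - x0 + (int)IntBy - 1) / (int)IntBy;

    for(int b = 0; b < nblocks; b++)                         // blocks of IntBy source columns
    {
        TOut *outptr = out + b * ldout;
        for(int k = k0; k < kmax; k++)                       // each source row contributes IntBy elements
        {
            for(unsigned int i = 0; i < IntBy; i++)
            {
                const int x = x0 + b * (int)IntBy + (int)i;  // ragged right edge is zero-padded
                *outptr++ = (x < xmax) ? static_cast<TOut>(in[k * stride + x]) : static_cast<TOut>(0);
            }
        }
    }
}
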
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 5252378db7..6c5b92ae8f 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -21,28 +21,31 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
-#define __ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#pragma once
-namespace arm_compute
+// Macro for unreachable code (e.g. impossible default cases on switch)
+#define UNREACHABLE(why) __builtin_unreachable()
+
+// Paranoid option for the above with assert
+// #define UNREACHABLE(why) assert(0 && why)
+
+inline int iceildiv(const int a, const int b)
{
-class ITensor;
+ return (a + b - 1) / b;
+}
-/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMAArch64Kernel : public NEGEMMAssemblyBaseKernel
+template <typename T>
+inline T roundup(const T a, const T b)
{
-public:
- const char *name() const override
+ T rem = a % b;
+
+ if(rem)
{
- return "NEGEMMAArch64Kernel";
+ return a + b - rem;
}
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/
+ else
+ {
+ return a;
+ }
+}
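
A short usage sketch for the two helpers above (assumes this utils.hpp is included; the values are worked examples): iceildiv() rounds an integer quotient up, and roundup() rounds a value up to the next multiple, e.g. when sizing blocked work buffers.

#include <cassert>

int main()
{
    assert(iceildiv(10, 4) == 3);    // ceil(10 / 4)
    assert(iceildiv(12, 4) == 3);    // exact multiples are unchanged
    assert(roundup(10, 4) == 12);    // next multiple of 4
    assert(roundup(12, 4) == 12);
    return 0;
}
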
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 05907bab07..c8cba8a174 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -26,37 +26,20 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMVAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/AssemblyHelper.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch-default"
-#pragma GCC diagnostic ignored "-Weffc++"
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/gemv_transposed.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemv_trans.hpp"
-#pragma GCC diagnostic pop
-} // namespace arm_compute
-
#include <cmath>
namespace arm_compute
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
_run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
@@ -82,42 +65,13 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ const bool run_optimised = setup_assembly_kernel(a, b, c, d, alpha, beta, _workspace, _memory_group, _asm_glue);
// Check if the first input tensor is a vector.
// If so, all the kernels for reshaping the tensors can be skipped
if(_run_vector_matrix_multiplication)
{
-#if defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMVAArch64Kernel>();
- }
-
- if(_mm_optimised_kernel != nullptr)
- {
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- size_t workbench_size = 0;
-
- if(a->info()->data_type() == DataType::F32)
- {
- workbench_size = GemvTransposed<sgemv_trans, sgemv_trans::operand_type, sgemv_trans::result_type>(&ci, N, K).get_working_size();
- }
-
- constexpr size_t alignment = 4096;
- ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* defined(__aarch64__) */
+ if(!run_optimised)
{
// Configure the matrix multiply kernel
_mm_kernel.configure(a, b, d, alpha, false);
@@ -132,65 +86,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
}
else
{
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
- else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
-#if defined(__arm__) || defined(__aarch64__)
- if(_mm_optimised_kernel != nullptr)
- {
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int M = d->info()->tensor_shape().y();
- const int N = d->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- size_t workbench_size = 0;
-
-#if defined(__arm__)
- workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
-#elif defined(__aarch64__)
- if(a->info()->data_type() == DataType::F32)
- {
- workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
- }
- else if(a->info()->data_type() == DataType::F16)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
- constexpr size_t alignment = 4096;
- ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
- _workspace.allocator()->allocate();
- }
- else
-#endif /* defined(__arm__) || defined(__aarch64__) */
+ if(!run_optimised)
{
TensorShape shape_tmp_a = a->info()->tensor_shape();
TensorShape shape_tmp_b = b->info()->tensor_shape();
@@ -243,9 +139,9 @@ void NEGEMM::run()
{
_memory_group.acquire();
- if(_mm_optimised_kernel != nullptr)
+ if(_asm_glue._optimised_kernel != nullptr)
{
- NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _asm_glue.run();
_memory_group.release();
}
else
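
The change above replaces the per-architecture kernel selection with a single setup_assembly_kernel() call at configure time, plus a glue object whose _optimised_kernel member decides the path taken in run(). A self-contained toy of that dispatch shape (all names here are hypothetical stand-ins, not the Compute Library API):

#include <cstdio>
#include <memory>

struct AsmGlueStub
{
    std::unique_ptr<int> _optimised_kernel;   // non-null once an assembly kernel has been configured
    void run() const
    {
        std::puts("assembly (arm_gemm) path");
    }
};

static bool setup_assembly_kernel_stub(AsmGlueStub &glue, bool supported)
{
    if(supported)
    {
        glue._optimised_kernel = std::make_unique<int>(0);
    }
    return supported;
}

int main()
{
    AsmGlueStub glue;
    const bool  run_optimised = setup_assembly_kernel_stub(glue, /* supported= */ true);

    if(glue._optimised_kernel != nullptr)
    {
        glue.run();                        // run() takes this branch when configure() found a kernel
    }
    else
    {
        std::puts("NEON fallback kernel"); // otherwise the existing matrix multiply kernel is scheduled
    }
    return run_optimised ? 0 : 1;
}
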
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index a85078cf71..3b8b4243e5 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -23,9 +23,6 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64NativeKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
@@ -34,13 +31,6 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
-} // namespace arm_compute
-
#include <cmath>
#include <tuple>
@@ -226,8 +216,8 @@ Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInf
} // namespace
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _mm_gemmlowp(memory_manager),
- _gemmlowp_output_stage(), _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
+ : _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+ _output_col2im_kernel(), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _tmp_output(), _workspace(), _append_bias(false),
_is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false), _is_interleaved(false)
{
}
@@ -256,25 +246,6 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
}
}
-void NEGEMMConvolutionLayer::configure_asm_mm(const struct CPUInfo &ci, int M, int N, int K)
-{
- ARM_COMPUTE_UNUSED(ci);
- ARM_COMPUTE_UNUSED(M);
- ARM_COMPUTE_UNUSED(N);
- ARM_COMPUTE_UNUSED(K);
-#if defined(__arm__) || defined(__aarch64__)
-#if defined(__arm__)
- GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
-#elif defined(__aarch64__)
- GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-
- constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-#endif /* defined(__arm__) || defined(__aarch64__) */
-}
-
void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
// Perform validate step
@@ -298,20 +269,11 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
const unsigned int fixed_point_position = input->info()->fixed_point_position();
const ITensor *biases_to_use = (_append_bias) ? biases : nullptr;
-#if defined(__arm__)
- if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch32Kernel>();
- }
-#elif defined(__aarch64__)
- if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
- }
-#endif /* defined(__arm__) || defined(__aarch64__) */
+ bool run_optimised =
+ (NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32) || (NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32);
// Reshape weights if needed
- if(_mm_optimised_kernel != nullptr)
+ if(run_optimised)
{
if(_are_weights_reshaped)
{
@@ -378,7 +340,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
_memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
- if(!_is_fully_connected_convolution && _mm_optimised_kernel == nullptr)
+ if(!_is_fully_connected_convolution && !run_optimised)
{
TensorShape shape_interleaved(shape_im2col);
shape_interleaved.set(0, shape_interleaved.x() * 4);
@@ -403,29 +365,10 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
_input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
// Configure matrix multiply
- if(_mm_optimised_kernel != nullptr)
+ if(run_optimised)
{
- struct CPUInfo ci = NEScheduler::get().cpu_info();
-
- const int M = _gemm_output.info()->tensor_shape().y();
- const int N = _gemm_output.info()->tensor_shape().x();
- const int K = _input_im2col_reshaped.info()->tensor_shape().x();
-
-#if defined(__aarch64__)
- if((N <= 128) && (K <= 128))
- {
- _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64NativeKernel>();
- }
- else
-#endif /* defined(__aarch64__) */
- {
- configure_asm_mm(ci, M, N, K);
- }
-
- // Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
-
- _workspace.allocator()->allocate();
+ run_optimised = setup_assembly_kernel(&_input_im2col_reshaped, weights, nullptr, &_gemm_output, 1.f, 0.f, _workspace, _memory_group, _asm_glue);
+ ARM_COMPUTE_ERROR_ON_MSG(run_optimised == false, "setup_assembly_kernel failed.");
}
else
{
@@ -615,9 +558,9 @@ void NEGEMMConvolutionLayer::run()
NEScheduler::get().schedule(&_input_im2col_kernel, Window::DimY);
// Runs matrix multiply on reshaped matrices
- if(_mm_optimised_kernel != nullptr)
+ if(_asm_glue._optimised_kernel != nullptr)
{
- NEScheduler::get().schedule(_mm_optimised_kernel.get(), Window::DimY);
+ _asm_glue.run();
}
else
{
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 9b36e81afd..e5e97910d8 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,4 +1,4 @@
-/* Copyright (c) 2017 ARM Limited.
+/* Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -39,20 +35,11 @@
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
-} // namespace arm_compute
-
using namespace arm_compute;
NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+ : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(),
+ _workspace()
{
}
@@ -65,89 +52,28 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITe
ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+ bool run_optimised = false;
#ifdef __aarch64__
- const int M = output->info()->tensor_shape().y();
- const int N = output->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
- constexpr size_t workspace_alignment = 4096;
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
-#endif /* __aarch64__ */
-
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else
-#elif defined(ARM_COMPUTE_AARCH64_V8A)
- if(ci.CPU == CPUTarget::A53)
+ switch(a->info()->data_type())
{
- switch(a->info()->data_type())
+ case DataType::S8:
{
- case DataType::S8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
+ run_optimised = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
break;
- case DataType::U8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
}
-
- _memory_group.manage(&_workspace);
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
- }
- else if(1) // Generic v8a kernel
- {
- switch(a->info()->data_type())
+ case DataType::U8:
{
- case DataType::S8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
+ run_optimised = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
break;
- case DataType::U8:
- {
- // Configure matrix multiply kernel
- GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- }
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
break;
- default:
- ARM_COMPUTE_ERROR("Datatype not supported");
}
- _memory_group.manage(&_workspace);
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
- _mm_kernel = std::move(k);
- _workspace.allocator()->allocate();
}
- else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+ if(!run_optimised)
{
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
TensorShape shape_tmp_a = a->info()->tensor_shape();
@@ -206,7 +132,18 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ {
+ _asm_glue_unsigned.run();
+ }
+ else if(_asm_glue_signed._optimised_kernel != nullptr)
+ {
+ _asm_glue_signed.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
_memory_group.release();
}
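
The run() hunks in this patch (here and in NEGEMMLowpMatrixMultiplyCore.cpp below) share one selection pattern: prefer whichever assembly "glue" object received an optimised kernel during configure(), and only fall back to scheduling the generic NEON matrix-multiply kernel when neither did. The standalone sketch below models that pattern with mock types; MockAsmGlue, MockKernel and MockScheduler are illustrative stand-ins, not Compute Library classes.

// Illustrative mock of the dispatch added to run(); not part of the patch.
#include <cstdio>
#include <memory>

struct MockKernel
{
};

struct MockAsmGlue
{
    std::unique_ptr<MockKernel> _optimised_kernel{}; // set during configure() when an assembly kernel is available
    void run()
    {
        std::printf("running assembly GEMM kernel\n");
    }
};

struct MockScheduler
{
    void schedule(MockKernel *)
    {
        std::printf("running fallback NEON kernel\n");
    }
};

int main()
{
    MockAsmGlue   asm_glue_unsigned{};
    MockAsmGlue   asm_glue_signed{};
    MockScheduler scheduler{};
    auto          mm_kernel = std::unique_ptr<MockKernel>(new MockKernel());

    // Pretend configure() picked the signed assembly path (e.g. S8 input on AArch64).
    asm_glue_signed._optimised_kernel.reset(new MockKernel());

    if(asm_glue_unsigned._optimised_kernel != nullptr)
    {
        asm_glue_unsigned.run();
    }
    else if(asm_glue_signed._optimised_kernel != nullptr)
    {
        asm_glue_signed.run();
    }
    else
    {
        scheduler.schedule(mm_kernel.get());
    }
    return 0;
}
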
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index ad47593f20..dc4ed5cefb 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -26,11 +26,9 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -39,18 +37,13 @@
#include "arm_compute/runtime/TensorAllocator.h"
#include "support/ToolchainSupport.h"
-namespace arm_compute
-{
-#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
-#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
-} // namespace arm_compute
-
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+ : _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false),
+ _dot_product_path(false)
{
}
@@ -64,33 +57,27 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_b_offset = b->info()->quantization_info().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- // Check for DOT product instruction
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
- if(cpu_has_dotprod != 0)
+#ifdef __aarch64__
+ switch(a->info()->data_type())
{
- _dot_product_path = true;
-
- // Configure matrix multiply kernel
- struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int M = output->info()->tensor_shape().y();
- const int N = output->info()->tensor_shape().x();
- const int K = a->info()->tensor_shape().x();
-
- const size_t workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
- constexpr size_t alignment = 4096;
- _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Configure matrix multiplication kernel
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
- _mm_kernel = std::move(k);
+ case DataType::S8:
+ {
+ _dot_product_path = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_signed);
+ break;
+ }
+ case DataType::U8:
+ {
+ _dot_product_path = setup_assembly_kernel(a, b, nullptr, output, 1.f, 1.f, _workspace, _memory_group, _asm_glue_unsigned);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Datatype not supported");
+ break;
+ }
}
- else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+#endif /* __aarch64__ */
+ if(!_dot_product_path)
{
if(_run_vector_matrix_multiplication)
{
@@ -203,42 +190,28 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
int32_t b_offset = b->quantization_info().offset;
bool run_vector_matrix_multiplication = a->dimension(1) < 2;
-#ifdef ARM_COMPUTE_AARCH64_V8_2
- // Check for DOT product instruction
- const struct CPUInfo ci = NEScheduler::get().cpu_info();
- const int cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
-
- if(cpu_has_dotprod != 0)
+ if(!run_vector_matrix_multiplication)
{
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->tensor_shape();
+ shape_tmp_a.set(0, a->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->tensor_shape();
+ shape_tmp_b.set(0, b->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
}
else
-#endif /* ARM_COMPUTE_AARCH64_V8_2 */
{
- if(!run_vector_matrix_multiplication)
- {
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
- }
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
}
TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -288,7 +261,18 @@ void NEGEMMLowpMatrixMultiplyCore::run()
}
}
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ {
+ _asm_glue_unsigned.run();
+ }
+ else if(_asm_glue_signed._optimised_kernel != nullptr)
+ {
+ _asm_glue_signed.run();
+ }
+ else
+ {
+ NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+ }
// Run matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
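
For the non-assembly path retained above, the temporary-tensor geometry follows directly from the set() calls in the configure()/validate() hunks: the interleaved copy of A is (a_width * 4) x ceil(a_height / 4) and the transposed copy of B is (b_height * 16) x ceil(b_width / 16). The short standalone program below simply evaluates those expressions; the matrix sizes are made up for illustration.

// Standalone check of the fallback-path reshape geometry; sizes are illustrative only.
#include <cmath>
#include <cstdio>

int main()
{
    const int a_w = 13, a_h = 7;  // A stored as (width = K = 13, height = M = 7)
    const int b_w = 20, b_h = 13; // B stored as (width = N = 20, height = K = 13)

    // shape_tmp_a: set(0, a_width * 4); set(1, ceil(a_height / 4.f))
    const int tmp_a_w = a_w * 4;
    const int tmp_a_h = static_cast<int>(std::ceil(a_h / 4.f));

    // shape_tmp_b: set(0, b_height * 16); set(1, ceil(b_width / 16.f))
    const int tmp_b_w = b_h * 16;
    const int tmp_b_h = static_cast<int>(std::ceil(b_w / 16.f));

    std::printf("tmp_a: %d x %d\n", tmp_a_w, tmp_a_h); // prints "tmp_a: 52 x 2"
    std::printf("tmp_b: %d x %d\n", tmp_b_w, tmp_b_h); // prints "tmp_b: 208 x 2"
    return 0;
}
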
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index a901b442ab..471ec3faaf 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
-#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"