aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/NEON/kernels
diff options
context:
space:
mode:
authorPablo Tello <pablo.tello@arm.com>2017-11-02 16:09:35 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commit6ff12a0f7765f62b8d0fa8554021e1cac2789f19 (patch)
tree9338db697789106b49ea391634be8b3c08ef9f97 /arm_compute/core/NEON/kernels
parent97988a4b3ef0f840432daf95b6e4b2ad7e5feefd (diff)
downloadComputeLibrary-6ff12a0f7765f62b8d0fa8554021e1cac2789f19.tar.gz
COMPMID-662: Integrated the new a64_s8_gemm_12x8 + dot product kernel into ACL.
Change-Id: Id8f919e486a132fc58346c9f84fccbeeb83d19b3 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94233 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'arm_compute/core/NEON/kernels')
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpAssemblyBaseKernel.h78
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h8
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h6
-rw-r--r--arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp13
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp61
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp367
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h66
-rw-r--r--arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp363
-rw-r--r--arm_compute/core/NEON/kernels/assembly/profiler.hpp18
12 files changed, 886 insertions, 100 deletions
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
index e298bfdebd..9e0fe8059b 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
@@ -30,7 +30,7 @@ namespace arm_compute
{
class ITensor;
-/** AssemblyBase/armv7a NEON kernel to multiply two input matrices "A" and "B". */
+/** Base class for GEMM NEON kernels implemented in Assembly. */
class NEGEMMAssemblyBaseKernel : public INEKernel
{
public:
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpAssemblyBaseKernel.h
deleted file mode 100644
index 32105ad6d4..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpAssemblyBaseKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMLOWPASSEMBLYBASE_H__
-#define __ARM_COMPUTE_NEGEMMLOWPASSEMBLYBASE_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** GEMMLOWP AssemblyBase NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAssemblyBaseKernel : public INEKernel
-{
-public:
- /** Constructor */
- NEGEMMLowpAssemblyBaseKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _transform_0(true), _transform_1(true)
- {
- }
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpAssemblyBaseKernel(const NEGEMMLowpAssemblyBaseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpAssemblyBaseKernel &operator=(const NEGEMMLowpAssemblyBaseKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpAssemblyBaseKernel(NEGEMMLowpAssemblyBaseKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpAssemblyBaseKernel &operator=(NEGEMMLowpAssemblyBaseKernel &&) = default;
-
- virtual ~NEGEMMLowpAssemblyBaseKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * The computed function is C = a * AxB + b * C.
- *
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32
- * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0
- * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output)
- {
- internal_configure(input0, input1, output);
- }
-
-protected:
- virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output) = 0;
-
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- ITensor *_workspace;
- bool _transform_0;
- bool _transform_1;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMLOWPASSEMBLYBASE_H__*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h
index 77b2bdc177..8908fabc1e 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpFinalizeKernel.h
@@ -62,7 +62,7 @@ public:
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of input0.
* Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p vector_sum_col
* @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: same as @p vector_sum_col
- * @param[out] output Output tensor containing the result of GEMMLowP. Data type supported: U8
+ * @param[out] output Output tensor containing the result of GEMMLowP. Data type supported: S8
* @param[in] num_mtx_a_cols Number of matrix A columns
* @param[in] a_offset Offset to be added to each element of the matrix A.
* @param[in] b_offset Offset to be added to each element of the matrix B.
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index 670274b8f3..f145eb6ca3 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -35,8 +35,8 @@ class ITensor;
* @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel
* This kernel performs the following computation:
*
- * -# Convert a values from uint8 to int32
- * -# Convert b values from uint8 to int32
+ * -# Convert a values from int8 to int32
+ * -# Convert b values from int8 to int32
* -# Compute the int32 matrix product of the resulting a * b and store the result as int32
*
*/
@@ -58,7 +58,7 @@ public:
* The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
* kernels change the layout of the original matrices to be more cache-friendly.
*
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8
+ * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: S8
* @param[in] input1 Input tensor containing the transposed Matrix B. Data type supported: same as @p input0
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
*/
@@ -74,4 +74,4 @@ private:
bool _slide_matrix_b;
};
} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/ \ No newline at end of file
+#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 143e8b917b..a069969681 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -48,7 +48,7 @@ public:
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor containing the interleaved or transposed matrix. Data type supported: U8
+ * @param[in] input Input tensor containing the interleaved or transposed matrix. Data type supported: S8
* @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
* @param[in] k Number of matrix A columns (or matrix B rows)
* @param[in] is_reshaped True if the input tensor has been reshaped
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
index 4eab7f91fc..32779891db 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
@@ -24,7 +24,7 @@
#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
#define __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpAssemblyBaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
// Enable only if compiled for AArch64-V8.2-A targets
#ifdef ARM_COMPUTE_AARCH64_V8_2
@@ -34,7 +34,7 @@ namespace arm_compute
class ITensor;
/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */
-class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMLowpAssemblyBaseKernel
+class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel
{
public:
// Inherited methods overridden:
@@ -42,7 +42,7 @@ public:
bool is_parallelisable() const override;
protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output) override;
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
index a186d88355..659ef837f5 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp
@@ -24,6 +24,7 @@
#pragma once
#include <stdio.h>
+#include <cassert>
#include "gemm_common.hpp"
#include "profiler.hpp"
@@ -114,12 +115,13 @@ public:
// Work out the rounded size of M - needed for some buffers.
Mround = (M + (strat.out_height - 1)) / strat.out_height;
Mround *= strat.out_height;
+
}
// Actually execute the GEMM.
void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override {
+ assert(working_space);
profiler prof;
-
int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes);
size_t diff = 0;
@@ -128,7 +130,6 @@ public:
diff = 0x10 - (working_space_int & 0xF);
}
- // TODO: Multithreaded implementations could share the burden of transforming these blocks.
Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + diff);
Toi * const b_panel = reinterpret_cast<Toi *>(working_space_bytes + get_a_working_size() + diff);
Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + get_a_working_size() + get_b_working_size() + diff);
@@ -141,7 +142,7 @@ public:
int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll;
kern_k *= strat.k_unroll;
- prof(PROFILE_PREPA, [&](void) {
+ prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) {
if (trA ^ strategy::A_transpose) {
Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax);
} else {
@@ -155,7 +156,7 @@ public:
int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width;
- prof(PROFILE_PREPB, [&](void) {
+ prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) {
if (trB ^ strategy::B_transpose) {
Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax);
} else {
@@ -167,8 +168,8 @@ public:
unsigned int ymax = y + strat.out_height;
if (ymax > M) ymax = M;
- prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
- prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
+ prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); });
+ prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); });
}
}
}
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp
new file mode 100644
index 0000000000..88cbb361b3
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+// Load the actual kernel
+#include "a64_gemm_s8_12x8/generic.hpp"
+
+class gemm_s8_12x8 {
+public:
+ typedef int8_t operand_type;
+ typedef int32_t result_type;
+
+ typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+ /* Describes the data layout for A input */
+ static const int A_interleave = 8;
+ static const int A_block = 4;
+ static const bool A_transpose = false;
+
+ /* Same for B input */
+ static const int B_interleave = 12;
+ static const int B_block = 4;
+ static const bool B_transpose = true;
+
+ /* Kernel blocking parameters */
+ static const int out_width = 12;
+ static const int out_height = 8;
+ static const int k_unroll = 4;
+
+ kern_type kernel = nullptr;
+
+ gemm_s8_12x8(const CPUInfo *ci) {
+ kernel = a64_gemm_s8_12x8;
+ }
+};
+
+#endif // __aarch64__
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
new file mode 100644
index 0000000000..5ed930c0b0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+#include "dot_toolchain_support.h"
+#include <cassert>
+
+void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ assert(Apanel);
+ assert(Bpanel);
+ assert(Cpanel);
+ K/=4;
+ const long int row_jump=0;
+ const long int block_jump=0;
+ const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel);
+ int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel);
+ for (int yb=0; yb<ablocks; yb++) {
+ const int32_t *a_ptr0 = a_ptr;
+ const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel);
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+
+ __asm __volatile (
+
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldp %q[a0], %q[a1], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldp %q[b0], %q[b1], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ _DECLARE_SDOT
+
+ // Loop proper
+ "1:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %d[b2], [%[b_ptr], #32]\n"
+
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr x20, [%[b_ptr], #40]\n"
+
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "add %[b_ptr], %[b_ptr], %[block_jump]\n"
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+
+
+ ".purgem sdot\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+ );
+
+
+ }
+ }
+
+}
+
+#endif
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
new file mode 100644
index 0000000000..1d6fd1623e
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Define a macro to assemble the SDOT instruction (in the absence of toolchain support)
+#define _DECLARE_SDOT ".altmacro\n"\
+ ".macro sdot opd:req, opn:req, opm:req\n"\
+ "local vd, vn, vm, h, l\n"\
+ ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\
+ ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\
+ ".set vd,\\reg\n"\
+ ".endif\n"\
+ ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\
+ ".set vn,\\reg\n"\
+ ".endif\n"\
+ ".irp idx,0,1,2,3\n"\
+ ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\
+ ".set vm,\\reg\n"\
+ ".set h,\\idx / 2\n"\
+ ".set l,\\idx %% 2\n"\
+ ".endif\n"\
+ ".endr\n"\
+ ".endr\n"\
+ ".ifndef vd\n"\
+ ".error \"Bad operand \\opd\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef vn\n"\
+ ".error \"Bad operand \\opn\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef vm\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef h\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".ifndef l\n"\
+ ".error \"Bad operand \\opm\"\n"\
+ ".exitm\n"\
+ ".endif\n"\
+ ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\
+ ".endm\n"\
+
diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
new file mode 100644
index 0000000000..bfad0373b2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+#include "dot_toolchain_support.h"
+#include <cassert>
+
+
+inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+ assert(Apanel);
+ assert(Bpanel);
+ assert(Cpanel);
+ K/=4;
+ const long int row_jump=0;
+ const long int block_jump=0;
+ const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel);
+ int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel);
+ for (int yb=0; yb<ablocks; yb++) {
+ const int32_t *a_ptr0 = a_ptr;
+ const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel);
+ for (int xb=0; xb<bblocks; xb++) {
+ a_ptr = a_ptr0;
+ // Fix up for odd lengths - set a flag if K is odd, but make
+ // sure we round up the iteration count.
+ int oddk = (K & 1);
+ int k = ((K+1)/2) - 1;
+ register int32x4_t a0 asm("v0");
+ register int32x4_t a1 asm("v1");
+ register int32x4_t b0 asm("v2");
+ register int32x4_t b1 asm("v3");
+ register int32x4_t b2 asm("v4");
+ register int32x4_t a0a asm("v5");
+ register int32x4_t a1a asm("v6");
+ __asm __volatile (
+ // Initialize result registers, load initial operands, prime prefetches.
+ "movi v8.4s, #0x0\n"
+ "ldr %q[a0], [%[a_ptr]]\n"
+ "movi v9.4s, #0x0\n"
+ "ldr %q[b0], [%[b_ptr]]\n"
+ "movi v10.4s, #0x0\n"
+ "ldr %q[a1], [%[a_ptr], #16]\n"
+ "movi v11.4s, #0x0\n"
+ "ldr %q[b1], [%[b_ptr], #16]\n"
+ "movi v12.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #64]")
+ "movi v13.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #64]")
+ "movi v14.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #128]")
+ "movi v15.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #128]")
+ "movi v16.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #192]")
+ "movi v17.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #256]")
+ "movi v18.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #192]")
+ "movi v19.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #320]")
+ "movi v20.4s, #0x0\n"
+ ASM_PREFETCH("[%[a_ptr], #256]")
+ "movi v21.4s, #0x0\n"
+ ASM_PREFETCH("[%[b_ptr], #384]")
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+
+ // Skip loop if we are doing zero iterations of it.
+ "cbz %w[k], 4f\n"
+
+ _DECLARE_SDOT
+
+ // Loop proper
+ "1:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[a_ptr], #320]")
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #448]")
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "ldr %q[a0], [%[a_ptr], #64]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "ldr %q[a1], [%[a_ptr], #80]\n"
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #96]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ ASM_PREFETCH("[%[b_ptr], #512]")
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #112]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "subs %w[k], %w[k], #1\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "bne 1b\n"
+
+ // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+ "4:\n"
+
+ // Branch to alternative tail for odd K
+ "cbnz %w[oddk], 2f\n"
+
+ // Detached final iteration (even K)
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "ldr %q[a0a], [%[a_ptr], #32]\n"
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "ldr %q[a1a], [%[a_ptr], #48]\n"
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "ldr %q[b0], [%[b_ptr], #48]\n"
+
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "ldr %q[b1], [%[b_ptr], #64]\n"
+
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "add %[a_ptr], %[a_ptr], #64\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "ldr %q[b2], [%[b_ptr], #80]\n"
+
+ "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n"
+
+ "add %[b_ptr], %[b_ptr], %[block_jump]\n"
+ "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #96\n"
+ "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n"
+ "str q24, [%[c_ptr], #32]\n"
+
+ "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+ "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+ "b 3f\n"
+
+ // Detached final iteration (odd K)
+ "2:\n"
+ "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+ "ldr %q[b2], [%[b_ptr], #32]\n"
+ "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], %[row_jump]\n"
+ "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+ "str q8, [%[c_ptr], #0]\n"
+ "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+ "str q16, [%[c_ptr], #16]\n"
+ "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+ "add %[b_ptr], %[b_ptr], #48\n"
+ "add %[a_ptr], %[a_ptr], #32\n"
+ "str q24, [%[c_ptr], #32]\n"
+ "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+ "str q9, [%[c_ptr], #48]\n"
+
+ "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+ "str q17, [%[c_ptr], #64]\n"
+ "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+ "str q25, [%[c_ptr], #80]\n"
+ "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+ "str q10, [%[c_ptr], #96]\n"
+
+ "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+ "str q18, [%[c_ptr], #112]\n"
+ "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+ "str q26, [%[c_ptr], #128]\n"
+ "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+ "str q11, [%[c_ptr], #144]\n"
+
+ "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+ "str q19, [%[c_ptr], #160]\n"
+ "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+ "str q27, [%[c_ptr], #176]\n"
+ "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+ "str q12, [%[c_ptr], #192]\n"
+
+ "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+ "str q20, [%[c_ptr], #208]\n"
+ "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+ "str q28, [%[c_ptr], #224]\n"
+ "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+ "str q13, [%[c_ptr], #240]\n"
+
+ "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+ "str q21, [%[c_ptr], #256]\n"
+ "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+ "str q29, [%[c_ptr], #272]\n"
+ "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+ "str q14, [%[c_ptr], #288]\n"
+
+ "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+ "str q22, [%[c_ptr], #304]\n"
+ "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+ "str q30, [%[c_ptr], #320]\n"
+ "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+ "str q15, [%[c_ptr], #336]\n"
+
+
+ // Common tail
+ "3:\n"
+ "str q23, [%[c_ptr], #352]\n"
+ "str q31, [%[c_ptr], #368]\n"
+ "add %[c_ptr], %[c_ptr], #384\n"
+
+ ".purgem sdot\n"
+ :
+ [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+ [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a),
+ [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+ : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump)
+ : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+ );
+ }
+ }
+
+
+}
+
+
+#endif
diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
index d2f8ba923a..f7a1d1c70c 100644
--- a/arm_compute/core/NEON/kernels/assembly/profiler.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/profiler.hpp
@@ -31,6 +31,7 @@ class profiler {
private:
static const int maxevents = 10000;
unsigned long times[maxevents];
+ unsigned long units[maxevents];
int events[maxevents];
int currentevent;
int countfd;
@@ -45,35 +46,38 @@ public:
close(countfd);
int tots[5];
unsigned long counts[5];
+ unsigned long tunits[5];
const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" };
for (int i=1; i<5; i++) {
tots[i] = 0;
counts[i] = 0;
+ tunits[i] = 0;
}
printf("Profiled events:\n");
for (int i=0; i<currentevent; i++) {
- printf("%10s: %ld\n", descs[events[i]-1], times[i]);
tots[events[i]]++;
counts[events[i]] += times[i];
+ tunits[events[i]] += units[i];
}
- printf("%20s %9s %9s %9s\n", "", "Events", "Total", "Average");
+ printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle");
for (int i=1; i<5; i++) {
- printf("%20s: %9d %9ld %9ld\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i]);
+ printf("%20s: %9d %9ld %9ld %12lu %9.2f\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i],tunits[i],(float)tunits[i]/counts[i]);
}
}
template <typename T>
- void operator() (int i, T func) {
+ void operator() (int i, unsigned long u, T func) {
if (currentevent==maxevents) {
func();
} else {
+ events[currentevent] = i;
+ units[currentevent] = u;
start_counter(countfd);
func();
long long cycs = stop_counter(countfd);
- events[currentevent] = i;
times[currentevent++] = cycs;
}
}
@@ -84,7 +88,7 @@ public:
class profiler {
public:
template <typename T>
- void operator() (int i, T func) {
+ void operator() (int i, unsigned long u, T func) {
func();
}
};
@@ -95,3 +99,5 @@ public:
#define PROFILE_PREPB 2
#define PROFILE_KERNEL 3
#define PROFILE_MERGE 4
+
+