-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h          | 28
-rw-r--r--  arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h         |  2
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h         |  2
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h  |  5
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h     |  5
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h |  2
-rw-r--r--  arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h    |  2
-rw-r--r--  src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp               | 20
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp               | 20
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp        | 31
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp           | 29
-rw-r--r--  src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp       | 21
-rw-r--r--  src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp          | 21
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp                 |  9
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp                             |  2
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp       |  2
16 files changed, 101 insertions(+), 100 deletions(-)
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
index 9e0fe8059b..1090dd5b0a 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
@@ -36,7 +36,7 @@ class NEGEMMAssemblyBaseKernel : public INEKernel
public:
/** Constructor */
NEGEMMAssemblyBaseKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _transform_0(true), _transform_1(true)
+ : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false)
{
}
@@ -55,22 +55,22 @@ public:
*
* The computed function is C = a * AxB + b * C.
*
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32
- * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0
- * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
- * @param[out] workspace Space for intermediate results.
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the accumulation.
- * @param[in] transform_0 If true the kernel will transform @p input0 prior to the multiplication.
- * @param[in] transform_1 If true the kernel will transform @p input1 prior to the multiplication.
+ * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32
+ * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0
+ * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
+ * @param[out] workspace Space for intermediate results.
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the accumulation.
+ * @param[in]     is_transposed_0 (Optional) True if @p input0 is transposed else false. (Defaults to false)
+ * @param[in]     is_transposed_1 (Optional) True if @p input1 is transposed else false. (Defaults to false)
*/
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool transform_0 = true, bool transform_1 = true)
+ void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false)
{
- internal_configure(input0, input1, output, workspace, alpha, beta, transform_0, transform_1);
+ internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1);
}
protected:
- virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) = 0;
+ virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool _is_transposed_0, bool _is_transposed_1) = 0;
const ITensor *_input0;
const ITensor *_input1;
@@ -78,8 +78,8 @@ protected:
ITensor *_workspace;
float _alpha;
float _beta;
- bool _transform_0;
- bool _transform_1;
+ bool _is_transposed_0;
+ bool _is_transposed_1;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__*/
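
Note on the interface change above: the old transform_0/transform_1 flags defaulted to true and meant "transform this operand before the multiplication", while the new is_transposed_0/is_transposed_1 flags default to false and mean "this operand is already transposed". A minimal call-site sketch against the new configure() signature (the kernel, tensor and workspace names below are hypothetical, not taken from the patch):

    // Both operands in their natural, non-transposed layout: the new
    // defaults (false, false) apply, so the flags can be omitted.
    kernel.configure(&a, &b, &c, &workspace, 1.f, 0.f);

    // Equivalent explicit form; pass true only for an operand whose
    // data is already stored transposed.
    kernel.configure(&a, &b, &c, &workspace, 1.f, 0.f,
                     false /* is_transposed_0 */, false /* is_transposed_1 */);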
diff --git a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
index 597acca439..7564f6a0e1 100644
--- a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h
@@ -38,7 +38,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
index 77431d2bc8..5c29a825c2 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h
@@ -38,7 +38,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
index 33cd2d42d0..8e9783720e 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h
@@ -43,10 +43,11 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
private:
- using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+ using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
+ const Window &window,
const ThreadInfo &info);
NEGEMMLowpAArch64A53 *_func;
};
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
index a93df033de..3829d5e1d6 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h
@@ -44,10 +44,11 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
private:
- using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+ using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1, const Window &window,
const ThreadInfo &info);
NEGEMMLowpAArch64 *_func;
};
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
index b03e5fa1a2..b94499392e 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
@@ -51,7 +51,7 @@ public:
static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
index 9480a6a5d0..5671d99b85 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h
@@ -38,7 +38,7 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
protected:
- void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
+ void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
index ad0743b50f..bffcbbf436 100644
--- a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
+++ b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
@@ -50,20 +50,20 @@ namespace arm_compute
namespace arm_compute
{
-void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
Window win = calculate_max_window(*output->info());
@@ -104,7 +104,7 @@ void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info)
Iterator in0(_input0, window);
Iterator out(_output, window);
- GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
void *workspace = _workspace->buffer() + offset;
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
index d70524b6b8..0eaa9aa39b 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
@@ -50,20 +50,20 @@ namespace arm_compute
namespace arm_compute
{
-void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
Window win = calculate_max_window(*output->info());
@@ -104,7 +104,7 @@ void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info)
Iterator in0(_input0, window);
Iterator out(_output, window);
- GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
void *workspace = _workspace->buffer() + offset;
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
index e020cd9118..80606dcc07 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
@@ -56,7 +56,8 @@ NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel()
{
}
-void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
+ const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -77,7 +78,7 @@ void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITe
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -99,7 +100,8 @@ void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITe
in0, out);
}
-void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
+ const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -120,7 +122,7 @@ void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITe
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -142,20 +144,21 @@ void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITe
in0, out);
}
-void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
switch(input0->info()->data_type())
{
@@ -192,7 +195,7 @@ void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &inf
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info);
+ (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
}
} // namespace arm_compute
#endif /* ARM_COMPUTE_AARCH64_V8A */
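
Note: besides the rename, the GemmInterleaved constructions in this kernel fix a latent bug; the removed lines passed !transform_1 for both of the trailing transpose arguments, so the flag intended for input0 was never consulted, whereas the new code forwards is_transposed_0 and is_transposed_1 independently. A hedged sketch of a caller that relies on this, assuming (hypothetically) that the B operand was produced in transposed layout by an earlier reshape step:

    // a, b, output and workspace are assumed to be valid ITensors;
    // only input1 is pre-transposed, so only is_transposed_1 is set.
    lowp_kernel.configure(&a, &b, &output, &workspace, 1.f, 0.f,
                          false /* is_transposed_0 */, true /* is_transposed_1 */);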
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
index db37201687..38f82f0407 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
@@ -56,7 +56,7 @@ NEGEMMLowpAArch64Kernel::NEGEMMLowpAArch64Kernel()
{
}
-void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -77,7 +77,7 @@ void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -99,7 +99,7 @@ void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *
in0, out);
}
-void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -120,7 +120,7 @@ void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -142,20 +142,21 @@ void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *
in0, out);
}
-void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
switch(input0->info()->data_type())
{
@@ -192,7 +193,7 @@ void NEGEMMLowpAArch64Kernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info);
+ (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
}
} // namespace arm_compute
#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
index e996e571ab..099934a49d 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
@@ -84,20 +84,21 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
namespace arm_compute
{
-void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
@@ -136,7 +137,7 @@ void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &in
Iterator in0(_input0, window);
Iterator out(_output, window);
- GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_1, !_transform_1);
+ GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
index 225630434b..38b9102c20 100644
--- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
@@ -50,20 +50,21 @@ namespace arm_compute
namespace arm_compute
{
-void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
Window win = calculate_max_window(*output->info());
@@ -105,7 +106,7 @@ void NEHGEMMAArch64FP16Kernel::run(const Window &window, const ThreadInfo &info)
Iterator in0(_input0, window);
Iterator out(_output, window);
- GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
void *workspace = _workspace->buffer() + offset;
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 25c639f7ea..5ca8eb8179 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -273,14 +273,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights,
_memory_group.manage(&_workspace);
// Configure matrix multiplication kernel
- if(_is_fully_connected_convolution)
- {
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f);
- }
- else
- {
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
- }
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
_workspace.allocator()->allocate();
}
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 950f4c9899..03ba43f901 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -142,7 +142,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
_memory_group.manage(&_workspace);
// Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
+ _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
_workspace.allocator()->allocate();
}
else
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 50aa5b6d11..c4028dca1d 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -84,7 +84,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure matrix multiplication kernel
auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
+ k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
_mm_kernel = std::move(k);
}
else