diff options
16 files changed, 101 insertions, 100 deletions
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h index 9e0fe8059b..1090dd5b0a 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h @@ -36,7 +36,7 @@ class NEGEMMAssemblyBaseKernel : public INEKernel public: /** Constructor */ NEGEMMAssemblyBaseKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _transform_0(true), _transform_1(true) + : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false) { } @@ -55,22 +55,22 @@ public: * * The computed function is C = a * AxB + b * C. * - * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32 - * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0 - * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0. - * @param[out] workspace Space for intermediate results. - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of the accumulation. - * @param[in] transform_0 If true the kernel will transform @p input0 prior to the multiplication. - * @param[in] transform_1 If true the kernel will transform @p input1 prior to the multiplication. + * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32 + * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0 + * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. 
Otherwise the values are overwritten by the result. Data types supported: same as @p input0. + * @param[out] workspace Space for intermediate results. + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of the accumulation. + * @param[in] is_transposed_0 (Optional) True if @p input0 is transposed else false. (Defaults to false) + * @param[in] is_transposed_1 (Optional) True if @p input1 is transposed else false. (Defaults to false) */ - void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool transform_0 = true, bool transform_1 = true) + void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false) { - internal_configure(input0, input1, output, workspace, alpha, beta, transform_0, transform_1); + internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1); } protected: - virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) = 0; + virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) = 0; const ITensor *_input0; const ITensor *_input1; @@ -78,8 +78,8 @@ protected: ITensor *_workspace; float _alpha; float _beta; - bool _transform_0; - bool _transform_1; + bool _is_transposed_0; + bool _is_transposed_1; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEGEMMASSEMBLYBASE_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h index 597acca439..7564f6a0e1 100644 --- a/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h +++ 
b/arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h @@ -38,7 +38,7 @@ public: void run(const Window &window, const ThreadInfo &info) override; protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEGEMMAARCH32KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h index 77431d2bc8..5c29a825c2 100644 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h @@ -38,7 +38,7 @@ public: void run(const Window &window, const ThreadInfo &info) override; protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEGEMMAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h index 33cd2d42d0..8e9783720e 100644 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h @@ -43,10 +43,11 @@ public: void run(const Window &window, const ThreadInfo &info) override; protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float 
alpha, float beta, bool transform_0, bool transform_1) override; + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; private: - using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, + using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, + const Window &window, const ThreadInfo &info); NEGEMMLowpAArch64A53 *_func; }; diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h index a93df033de..3829d5e1d6 100644 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h @@ -44,10 +44,11 @@ public: void run(const Window &window, const ThreadInfo &info) override; protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; private: - using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, + using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, + bool is_transposed_1, const Window &window, const ThreadInfo &info); NEGEMMLowpAArch64 *_func; }; diff --git 
a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h index b03e5fa1a2..b94499392e 100644 --- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h @@ -51,7 +51,7 @@ public: static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output); protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; }; } // namespace arm_compute #endif /* ARM_COMPUTE_AARCH64_V8_2 */ diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h index 9480a6a5d0..5671d99b85 100644 --- a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h +++ b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h @@ -38,7 +38,7 @@ public: void run(const Window &window, const ThreadInfo &info) override; protected: - void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) override; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/ diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp index ad0743b50f..bffcbbf436 100644 --- a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp +++ 
b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp @@ -50,20 +50,20 @@ namespace arm_compute namespace arm_compute { -void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) +void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _transform_0 = transform_0; - _transform_1 = transform_1; + _input0 = input0; + _input1 = input1; + _output = output; + _workspace = workspace; + _alpha = alpha; + _beta = beta; + _is_transposed_0 = is_transposed_0; + _is_transposed_1 = is_transposed_1; // Configure kernel window Window win = calculate_max_window(*output->info()); @@ -104,7 +104,7 @@ void NEGEMMAArch32Kernel::run(const Window &window, const ThreadInfo &info) Iterator in0(_input0, window); Iterator out(_output, window); - GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1); + GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; void *workspace = _workspace->buffer() + offset; diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp index d70524b6b8..0eaa9aa39b 100644 --- a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp +++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp @@ -50,20 
+50,20 @@ namespace arm_compute namespace arm_compute { -void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) +void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _transform_0 = transform_0; - _transform_1 = transform_1; + _input0 = input0; + _input1 = input1; + _output = output; + _workspace = workspace; + _alpha = alpha; + _beta = beta; + _is_transposed_0 = is_transposed_0; + _is_transposed_1 = is_transposed_1; // Configure kernel window Window win = calculate_max_window(*output->info()); @@ -104,7 +104,7 @@ void NEGEMMAArch64Kernel::run(const Window &window, const ThreadInfo &info) Iterator in0(_input0, window); Iterator out(_output, window); - GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1); + GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; void *workspace = _workspace->buffer() + offset; diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp index e020cd9118..80606dcc07 100644 --- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp +++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp @@ -56,7 +56,8 @@ 
NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel() { } -void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, +void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, + const Window &window, const ThreadInfo &info) { const int lda = input0->info()->strides_in_bytes().y(); @@ -77,7 +78,7 @@ void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITe Iterator in0(input0, window); Iterator out(output, window); - GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1); + GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; @@ -99,7 +100,8 @@ void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITe in0, out); } -void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, +void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, + const Window &window, const ThreadInfo &info) { const int lda = input0->info()->strides_in_bytes().y(); @@ -120,7 +122,7 @@ void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITe Iterator in0(input0, window); Iterator out(output, window); - GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1); + GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, 
N, K, is_transposed_0, is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; @@ -142,20 +144,21 @@ void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITe in0, out); } -void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) +void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, + bool is_transposed_1) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _transform_0 = transform_0; - _transform_1 = transform_1; + _input0 = input0; + _input1 = input1; + _output = output; + _workspace = workspace; + _alpha = alpha; + _beta = beta; + _is_transposed_0 = is_transposed_0; + _is_transposed_1 = is_transposed_1; switch(input0->info()->data_type()) { @@ -192,7 +195,7 @@ void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &inf ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info); + (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info); } } // namespace arm_compute #endif /* ARM_COMPUTE_AARCH64_V8A */ diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp index db37201687..38f82f0407 100644 --- 
a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp +++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp @@ -56,7 +56,7 @@ NEGEMMLowpAArch64Kernel::NEGEMMLowpAArch64Kernel() { } -void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, +void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window, const ThreadInfo &info) { const int lda = input0->info()->strides_in_bytes().y(); @@ -77,7 +77,7 @@ void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor * Iterator in0(input0, window); Iterator out(output, window); - GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1); + GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; @@ -99,7 +99,7 @@ void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor * in0, out); } -void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, +void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window, const ThreadInfo &info) { const int lda = input0->info()->strides_in_bytes().y(); @@ -120,7 +120,7 @@ void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor * Iterator in0(input0, window); Iterator out(output, window); - GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, 
!transform_1, !transform_1); + GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id; @@ -142,20 +142,21 @@ void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor * in0, out); } -void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) +void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, + bool is_transposed_1) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _transform_0 = transform_0; - _transform_1 = transform_1; + _input0 = input0; + _input1 = input1; + _output = output; + _workspace = workspace; + _alpha = alpha; + _beta = beta; + _is_transposed_0 = is_transposed_0; + _is_transposed_1 = is_transposed_1; switch(input0->info()->data_type()) { @@ -192,7 +193,7 @@ void NEGEMMLowpAArch64Kernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info); + (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info); } } // namespace arm_compute #endif /* ARM_COMPUTE_AARCH64_V8A */ diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp 
b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp index e996e571ab..099934a49d 100644 --- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp +++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp @@ -84,20 +84,21 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe namespace arm_compute { -void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) +void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, + bool is_transposed_1) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _transform_0 = transform_0; - _transform_1 = transform_1; + _input0 = input0; + _input1 = input1; + _output = output; + _workspace = workspace; + _alpha = alpha; + _beta = beta; + _is_transposed_0 = is_transposed_0; + _is_transposed_1 = is_transposed_1; // Configure kernel window auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); @@ -136,7 +137,7 @@ void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &in Iterator in0(_input0, window); Iterator out(_output, window); - GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_1, !_transform_1); + GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 
1) * info.thread_id; diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp index 225630434b..38b9102c20 100644 --- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp +++ b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp @@ -50,20 +50,21 @@ namespace arm_compute namespace arm_compute { -void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) +void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, + bool is_transposed_1) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - _input0 = input0; - _input1 = input1; - _output = output; - _workspace = workspace; - _alpha = alpha; - _beta = beta; - _transform_0 = transform_0; - _transform_1 = transform_1; + _input0 = input0; + _input1 = input1; + _output = output; + _workspace = workspace; + _alpha = alpha; + _beta = beta; + _is_transposed_0 = is_transposed_0; + _is_transposed_1 = is_transposed_1; // Configure kernel window Window win = calculate_max_window(*output->info()); @@ -105,7 +106,7 @@ void NEHGEMMAArch64FP16Kernel::run(const Window &window, const ThreadInfo &info) Iterator in0(_input0, window); Iterator out(_output, window); - GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1); + GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1); constexpr size_t alignment = 4096; const size_t offset = (gemm.get_working_size() + alignment - 1) 
* info.thread_id; void *workspace = _workspace->buffer() + offset; diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 25c639f7ea..5ca8eb8179 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -273,14 +273,7 @@ void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, _memory_group.manage(&_workspace); // Configure matrix multiplication kernel - if(_is_fully_connected_convolution) - { - _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f); - } - else - { - _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace); - } + _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace); _workspace.allocator()->allocate(); } diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 950f4c9899..03ba43f901 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -142,7 +142,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe _memory_group.manage(&_workspace); // Configure matrix multiplication kernel - _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f); + _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */); _workspace.allocator()->allocate(); } else diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 50aa5b6d11..c4028dca1d 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -84,7 +84,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, // Configure matrix multiplication kernel auto k = 
arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>(); - k->configure(a, b, output, &_workspace, 1.f, 1.f); + k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false); _mm_kernel = std::move(k); } else |