COMPMID-2577: Fuse bias addition and activation in gemm assembly kernels

Change-Id: I7f52112d2d05b1ea3d3f3d4b19b8eafab05d6c44
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2141
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
author: Georgios Pinitas <georgios.pinitas@arm.com> 2019-10-14 19:03:09 +0100
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2019-10-23 12:08:12 +0000
commit: 48b3ef89de5f21a0169d8416e3d54081f82c7bf8 (patch)
tree: f857d733ccf446c704823dc7ac796a96eb55095e /src/core/NEON/kernels/assembly
parent: 1dce3101ef8d77c8cf0af7dfd4af6595a0136b91 (diff)
download: ComputeLibrary-48b3ef89de5f21a0169d8416e3d54081f82c7bf8.tar.gz
3 files changed, 5 insertions, 368 deletions
diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp
index 3d8d66d7fc..93ea6c8d5e 100644
--- a/src/core/NEON/kernels/assembly/Helpers.cpp
+++ b/src/core/NEON/kernels/assembly/Helpers.cpp
@@ -24,16 +24,13 @@
 
 #include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
 
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
 namespace arm_compute
 {
 arm_gemm::KernelDescription get_gemm_info(DataType                            input_type,
                                           const CPUInfo                      &ci,
                                           const unsigned int                  num_threads,
                                           const INEGEMMWrapperKernel::Params &p,
-                                          float                               alpha,
-                                          float                               beta,
+                                          arm_gemm::Activation                activation,
                                           bool                                pretranspose_hint)
 {
     switch(input_type)
@@ -42,25 +39,25 @@ arm_gemm::KernelDescription get_gemm_info(DataType                            in
         case DataType::QASYMM8:
         case DataType::U8:
         {
-            arm_gemm::GemmArgs<uint32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
             return arm_gemm::get_gemm_method<uint8_t, uint32_t>(args);
         }
         case DataType::S8:
         {
-            arm_gemm::GemmArgs<int32_t> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
             return arm_gemm::get_gemm_method<int8_t, int32_t>(args);
         }
 #endif // __aarch64__
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {
-            arm_gemm::GemmArgs<__fp16> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
             return arm_gemm::get_gemm_method<__fp16, __fp16>(args);
         }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
         {
-            arm_gemm::GemmArgs<float> args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint);
+            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
             return arm_gemm::get_gemm_method<float, float>(args);
         }
         default:
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
deleted file mode 100644
index 6e30148b5d..0000000000
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVEDSTRATEGIES_H__
-#define __ARM_COMPUTE_NEGEMMINTERLEAVEDSTRATEGIES_H__
-
-#include "../arm_gemm/utils.hpp"
-#include "arm_gemm.hpp"
-
-#include "../arm_gemm/mergeresults.hpp"
-#include "../arm_gemm/transform.hpp"
-
-#include "../arm_gemm/kernels/a32_sgemm_8x6.hpp"
-#include "../arm_gemm/kernels/a64_gemm_s8_12x8.hpp"
-#include "../arm_gemm/kernels/a64_gemm_s8_4x4.hpp"
-#include "../arm_gemm/kernels/a64_gemm_u8_12x8.hpp"
-#include "../arm_gemm/kernels/a64_gemm_u8_4x4.hpp"
-#include "../arm_gemm/kernels/a64_hgemm_24x8.hpp"
-#include "../arm_gemm/kernels/a64_sgemm_12x8.hpp"
-#include "../arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
-#include "../arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
-#include "../arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
-#include "../arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
-
-namespace arm_compute
-{
-namespace detail
-{
-/** GEMM Interleaved Strategy interface */
-class IInterleavedStrategy
-{
-public:
-    /** Virtual Destructor */
-    virtual ~IInterleavedStrategy() = default;
-    /** Return output height of the interleaved strategy
-     *
-     * @return Output height of strategy
-     */
-    virtual unsigned int out_height() const = 0;
-    /** Instantiate and configure a prepareB Kernel
-     *
-     * @param[in] b             Input tensor B.
-     * @param[in] transformed_b Reshaped tensor B.
-     * @param[in] params        GM, N, K sizes.
-     * @param[in] ci            CPUInfo to be used for kernel configuration.
-     *
-     * @return A wrapped specialized prepareB kernel
-     */
-    virtual std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor                      *b,
-                                                                                         ITensor                            *transformed_b,
-                                                                                         const INEGEMMWrapperKernel::Params &params,
-                                                                                         const CPUInfo                      &ci) = 0;
-    /** Instantiate and configure a transformA Kernel
-     *
-     * @param[in] a             Input tensor A.
-     * @param[in] transformed_a Reshaped tensor A.
-     * @param[in] block_walker  Window representing the layout of the matrix's blocks.
-     * @param[in] params        M, N, K sizes.
-     * @param[in] gemm_info     GEMM meta-data
-     *
-     * @return A wrapped specialized transformA kernel
-     */
-    virtual std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor                      *a,
-                                                                                       ITensor                            *transformed_a,
-                                                                                       const Window                       &block_walker,
-                                                                                       const INEGEMMWrapperKernel::Params &params,
-                                                                                       const GEMMInfo                     &gemm_info) = 0;
-    /** Instantiate and configure a prepareB Kernel
-     *
-     * @param[in] transformed_a Already reshaped tensor A.
-     * @param[in] transformed_b Already reshaped tensor B.
-     * @param[in] tmp_c         Temporary buffer to be used to store intermediate results.
-     * @param[in] c             Result tensor C.
-     * @param[in] block_walker  Window containing iteration information for the M and batch dimensions.
-     * @param[in] block_sizes   Block sizes to use for the matrix multiplication (A & B must have been reshaped using these same block sizes).
-     * @param[in] params        M, N, K sizes.
-     * @param[in] alpha         Alpha value
-     * @param[in] beta          Beta value
-     * @param[in] gemm_info     GEMM meta-data
-     * @param[in] num_threads   Maximum number of threads that might be used for the calculations.
-     *
-     * @return A wrapped specialized MatrixMultiply kernel
-     */
-    virtual std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
-                                                                                                const Window &block_walker, const BlockSizes &block_sizes,
-                                                                                                const INEGEMMWrapperKernel::Params &params, float alpha, float beta, const GEMMInfo &gemm_info,
-                                                                                                unsigned int num_threads) = 0;
-    /** Calculates the block sizes of a given strategy
-     *
-     * @param[in] ci     CPUInfo to be used for kernel configuration.
-     * @param[in] params M, N, K sizes.
-     *
-     * @return BlockSizes for a given strategy
-     */
-    virtual BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params) = 0;
-};
-
-/** Interleaved Strategy class */
-template <typename StrategyType>
-class InterleavedStrategy : public IInterleavedStrategy
-{
-public:
-    using strategy = StrategyType;
-
-public:
-    // Inherited methods overridden
-    unsigned int out_height() const override
-    {
-        return strategy::out_height();
-    }
-    std::unique_ptr<NEGEMMInterleavedPrepareBWrapperKernel> instantiate_prepareB(const ITensor                      *b,
-                                                                                 ITensor                            *transformed_b,
-                                                                                 const INEGEMMWrapperKernel::Params &params,
-                                                                                 const CPUInfo                      &ci) override
-    {
-        auto prepare_b = support::cpp14::make_unique<NEGEMMInterleavedPrepareBWrapperKernelTemplate<strategy>>();
-        prepare_b->configure(b, transformed_b, false, ci, params);
-        return std::move(prepare_b);
-    }
-    std::unique_ptr<NEGEMMInterleavedTransformAWrapper> instantiate_transformA(const ITensor                      *a,
-                                                                               ITensor                            *transformed_a,
-                                                                               const Window                       &block_walker,
-                                                                               const INEGEMMWrapperKernel::Params &params,
-                                                                               const GEMMInfo                     &gemm_info) override
-    {
-        auto transform_a = support::cpp14::make_unique<NEGEMMInterleavedTransformAWrapperTemplate<strategy>>();
-        transform_a->configure(a, transformed_a, false, gemm_info.reinterpret_input_as_3d(), block_walker, params);
-        return std::move(transform_a);
-    }
-    std::unique_ptr<NEGEMMInterleavedMatrixMultiplyWrapper> instantiate_matrix_multiply(const ITensor *transformed_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c,
-                                                                                        const Window &block_walker, const BlockSizes &block_sizes,
-                                                                                        const INEGEMMWrapperKernel::Params &params, float alpha, float beta, const GEMMInfo &gemm_info,
-                                                                                        unsigned int num_threads) override
-    {
-        auto matrix_multiply = support::cpp14::make_unique<NEGEMMInterleavedMatrixMultiplyWrapperTemplate<strategy>>();
-        matrix_multiply->configure(transformed_a, transformed_b, tmp_c, c, block_walker, block_sizes, params, gemm_info, alpha, beta, num_threads);
-        return std::move(matrix_multiply);
-    }
-
-    BlockSizes calculate_block_sizes_for_strategy(const CPUInfo &ci, const INEGEMMWrapperKernel::Params &params) override
-    {
-        return calculate_block_sizes<strategy>(ci, params.M, params.N, params.K);
-    }
-};
-
-/** Create the backend GEMM strategy to use given the provided kernel info
- *
- * @param[in] kernel_name Kernel name of the backend strategy to instantiate
- *
- * @return The requested kernel strategy if exists else nullptr
- */
-std::unique_ptr<IInterleavedStrategy> create_strategy(const std::string &kernel_name)
-{
-#if defined(__arm__)
-    if(kernel_name.find("sgemm_8x6") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_8x6>>();
-    }
-#endif // defined(__arm__)
-#if defined(__aarch64__)
-    if(kernel_name.find("gemm_s8_4x4") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_4x4>>();
-    }
-    if(kernel_name.find("gemm_s8_12x8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_s8_12x8>>();
-    }
-    if(kernel_name.find("gemm_u8_4x4") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_4x4>>();
-    }
-    if(kernel_name.find("gemm_u8_12x8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::gemm_u8_12x8>>();
-    }
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    if(kernel_name.find("hgemm_24x8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::hgemm_24x8>>();
-    }
-#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    if(kernel_name.find("sgemm_12x8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::sgemm_12x8>>();
-    }
-#if defined(__ARM_FEATURE_SVE)
-    if(kernel_name.find("interleaved_fp16_mla_3VLx8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp16_mla_3VLx8>>();
-    }
-    if(kernel_name.find("interleaved_fp32_mla_3VLx8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_fp32_mla_3VLx8>>();
-    }
-    if(kernel_name.find("interleaved_s8s32_dot_3VLx8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_s8s32_dot_3VLx8>>();
-    }
-    if(kernel_name.find("interleaved_u8u32_dot_3VLx8") != std::string::npos)
-    {
-        return support::cpp14::make_unique<InterleavedStrategy<arm_gemm::interleaved_u8u32_dot_3VLx8>>();
-    }
-#endif // defined(__ARM_FEATURE_SVE)
-#endif // defined(__aarch64__)_
-    return nullptr;
-}
-} // namespace detail
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEGEMMINTERLEAVEDSTRATEGIES_H__ */
diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
deleted file mode 100644
index ecdb5a938c..0000000000
--- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/WindowIterator.h"
-
-#include "../arm_gemm/utils.hpp"
-#include "arm_gemm.hpp"
-
-#include "../arm_gemm/mergeresults.hpp"
-#include "../arm_gemm/transform.hpp"
-
-#include "../arm_gemm/kernels/a64_sgemm_native_16x4.hpp"
-
-namespace arm_compute
-{
-namespace
-{
-template <typename To, typename Tr>
-struct Kernel
-{
-};
-
-#ifdef __aarch64__
-template <>
-struct Kernel<float, float>
-{
-    using strategy = arm_gemm::sgemm_native_16x4;
-};
-#endif /* __aarch64__ */
-
-} // namespace
-
-template <typename To, typename Tr>
-Window NEGEMMNativeWrapperKernel<To, Tr>::configure_internal(float alpha, float beta)
-{
-    ARM_COMPUTE_UNUSED(alpha);
-    using strategy = typename Kernel<To, Tr>::strategy;
-
-    _beta = beta;
-
-    //Note: The window is shifted down by 1 dimension compare to the tensors
-    Window window;
-    window.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.M, strategy::out_height()), strategy::out_height()));
-    window.set(Window::DimY, Window::Dimension(0, _params.batches));
-    window.set(Window::DimZ, Window::Dimension(0, _params.multis));
-
-    return window;
-}
-
-template <typename To, typename Tr>
-void NEGEMMNativeWrapperKernel<To, Tr>::run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info)
-{
-    using strategy = typename Kernel<To, Tr>::strategy;
-
-    TensorAccessor<To> a(*_a);
-    TensorAccessor<To> b(*_b);
-    TensorAccessor<Tr> c(*_c);
-
-    // Handle 3d input re-interpretation
-    if(_gemm_info.reinterpret_input_as_3d())
-    {
-        Strides a_strides_as_3d = _a->info()->strides_in_bytes();
-        a_strides_as_3d.remove(Window::DimZ);
-        a.set_strides(a_strides_as_3d);
-    }
-
-    // Handle 3d output re-interpretation
-    if(_gemm_info.depth_output_gemm3d() != 0)
-    {
-        Strides c_strides_as_3d = _c->info()->strides_in_bytes();
-        c_strides_as_3d.remove(Window::DimZ);
-        c.set_strides(c_strides_as_3d);
-    }
-
-    unsigned int m_end = 0;
-
-    strategy strat(info.cpu_info);
-    auto window_iterator = arm_compute::create_window_iterator(window, start_offset, end_offset, [&](const Coordinates & id)
-    {
-        const unsigned int y0    = id.x();
-        const unsigned int batch = id.y();
-        const unsigned int multi = id.z();
-        const unsigned int ymax  = std::min(y0 + strategy::out_height(), m_end);
-
-        // TODO(COMPMID-1424) : Agree on gemm IO layouts
-        strat.kernel(a(0, y0, batch, multi), a.stride(Window::DimY),
-                     b(0, 0, multi), b.stride(Window::DimY),
-                     c(0, y0, batch, multi), c.stride(Window::DimY),
-                     _beta, (ymax - y0), _params.N, _params.K);
-    });
-
-    auto on_new_row_size = [&](unsigned int start, unsigned int end)
-    {
-        ARM_COMPUTE_UNUSED(start);
-        m_end = std::min(end, _params.M);
-    };
-
-    window_iterator.iterate_3D(on_new_row_size);
-}
-
-#ifdef __aarch64__
-template class NEGEMMNativeWrapperKernel<float, float>;
-#endif /* __aarch64__ */
-
-} // namespace arm_compute
author	Georgios Pinitas <georgios.pinitas@arm.com>	2019-10-14 19:03:09 +0100
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2019-10-23 12:08:12 +0000
commit	48b3ef89de5f21a0169d8416e3d54081f82c7bf8 (patch)
tree	f857d733ccf446c704823dc7ac796a96eb55095e /src/core/NEON/kernels/assembly
parent	1dce3101ef8d77c8cf0af7dfd4af6595a0136b91 (diff)
download	ComputeLibrary-48b3ef89de5f21a0169d8416e3d54081f82c7bf8.tar.gz