diff options
author | Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-27 17:46:17 +0100 |
---|---|---|
committer | felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com> | 2023-09-28 12:08:05 +0000 |
commit | afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch) | |
tree | 03bc7d5a762099989b16a656fa8d397b490ed70e /src/cpu/kernels/assembly | |
parent | bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff) | |
download | ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz |
Apply clang-format on repository
Code is formatted as per a revised clang-format configuration
file (not part of this delivery). Version 14.0.6 is used.
Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)
And the following directories
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/
There will be a follow up for formatting of .cl files and the
files under tests/ and compute_kernel_writer/validation/.
Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Diffstat (limited to 'src/cpu/kernels/assembly')
-rw-r--r-- | src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h | 12 | ||||
-rw-r--r-- | src/cpu/kernels/assembly/arm_gemm.hpp | 91 | ||||
-rw-r--r-- | src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp | 38 | ||||
-rw-r--r-- | src/cpu/kernels/assembly/gemm_common.hpp | 74 | ||||
-rw-r--r-- | src/cpu/kernels/assembly/ndrange.hpp | 19 |
5 files changed, 148 insertions, 86 deletions
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h index 10bf8e4ff7..6e8f32ef47 100644 --- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/NEON/INEKernel.h" #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" @@ -57,13 +58,12 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel public: /** Constructor */ - CpuGemmAssemblyWrapperKernel() - : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") + CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") { } - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; const char *name() const override @@ -110,7 +110,7 @@ public: INEKernel::configure(win); - if(!kernel_name_tag.empty()) + if (!kernel_name_tag.empty()) { _name += "/" + kernel_name_tag; } @@ -132,7 +132,7 @@ public: private: arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel; - std::string _name; + std::string _name; }; } // namespace kernel } // namespace cpu diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp index 4c127b4ec3..9a913c5c58 100644 --- a/src/cpu/kernels/assembly/arm_gemm.hpp +++ b/src/cpu/kernels/assembly/arm_gemm.hpp @@ -23,13 +23,12 @@ */ #pragma once +#include "arm_gemm_local.hpp" +#include "gemm_common.hpp" #include <cstring> #include <memory> #include <vector> -#include "arm_gemm_local.hpp" -#include "gemm_common.hpp" - namespace arm_gemm { enum class GemmMethod @@ -111,8 
+110,7 @@ struct GemmConfig unsigned int outer_block_size = 0; WeightFormat weight_format = WeightFormat::ANY; - GemmConfig(GemmMethod method) - : method(method) + GemmConfig(GemmMethod method) : method(method) { } GemmConfig() @@ -133,8 +131,7 @@ struct Activation float param1; float param2; - Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) - : type(type), param1(p1), param2(p2) + Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2) { } }; @@ -156,12 +153,32 @@ public: bool _fast_mode; const GemmConfig *_cfg; - GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, - unsigned int K, unsigned int Ksections, unsigned int nbatches, - unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, - bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), - _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg) + GemmArgs(const CPUInfo *ci, + unsigned int M, + unsigned int N, + unsigned int K, + unsigned int Ksections, + unsigned int nbatches, + unsigned int nmulti, + bool indirect_input, + Activation act, + const int maxthreads, + bool fixed_format = false, + bool fast_mode = false, + const GemmConfig *cfg = nullptr) + : _ci(ci), + _Msize(M), + _Nsize(N), + _Ksize(K), + _Ksections(Ksections), + _nbatches(nbatches), + _nmulti(nmulti), + _indirect_input(indirect_input), + _act(act), + _maxthreads(maxthreads), + _fixed_format(fixed_format), + _fast_mode(fast_mode), + _cfg(cfg) { } }; @@ -187,23 +204,51 @@ public: Requantize32() = default; // Constructor for per-tensor quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) - 
: bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)), - per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, + int32_t requant_shift, + int32_t requant_mul, + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(false), + per_layer_left_shift(std::max<int32_t>(requant_shift, 0)), + per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), + per_layer_mul(requant_mul), + minval(minv), + maxval(maxv) { } // Constructor for per-channel quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, const int32_t *requant_left_shifts, const int32_t *requant_right_shifts, const int32_t *requant_muls, - int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), - per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(true), + per_channel_left_shifts(requant_left_shifts), + per_channel_right_shifts(requant_right_shifts), + per_channel_muls(requant_muls), + minval(minv), + maxval(maxv) { } }; diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp 
b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp index 718fcd1fb4..0672e899b6 100644 --- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp +++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp @@ -27,7 +27,6 @@ #include "arm_compute/core/Window.h" #include "ndrange.hpp" - #include <cassert> /* This file contains mapping between integral types used in arm_compute and arm_gemm @@ -38,8 +37,7 @@ namespace arm_gemm { //we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library -constexpr std::size_t ndrange_max = - arm_compute::Dimensions<unsigned int>::num_max_dimensions; +constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions; using ndrange_t = NDRange<ndrange_max>; using ndcoord_t = NDCoordinate<ndrange_max>; @@ -56,7 +54,7 @@ inline arm_compute::Window to_window(const ndrange_t &ndr) { arm_compute::Window win; - for(unsigned int i = 0; i != ndrange_max; ++i) + for (unsigned int i = 0; i != ndrange_max; ++i) { //populate the window with the dimensions of the NDRange win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); @@ -75,7 +73,7 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) { arm_compute::Window win; - for(unsigned int i = 0; i != ndrange_max; ++i) + for (unsigned int i = 0; i != ndrange_max; ++i) { const auto start = ndc.get_position(i); const auto size = ndc.get_size(i); @@ -98,15 +96,12 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) */ inline ndrange_t to_ndrange(const arm_compute::Window &win) { - return - { - static_cast<unsigned int>(win[0].end() - win[0].start()), - static_cast<unsigned int>(win[1].end() - win[1].start()), - static_cast<unsigned int>(win[2].end() - win[2].start()), - static_cast<unsigned int>(win[3].end() - win[3].start()), - static_cast<unsigned int>(win[4].end() - win[4].start()), - static_cast<unsigned int>(win[5].end() - win[5].start()) - }; + return {static_cast<unsigned int>(win[0].end() - 
win[0].start()), + static_cast<unsigned int>(win[1].end() - win[1].start()), + static_cast<unsigned int>(win[2].end() - win[2].start()), + static_cast<unsigned int>(win[3].end() - win[3].start()), + static_cast<unsigned int>(win[4].end() - win[4].start()), + static_cast<unsigned int>(win[5].end() - win[5].start())}; } /** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions @@ -116,15 +111,12 @@ inline ndrange_t to_ndrange(const arm_compute::Window &win) */ inline ndcoord_t to_ndcoord(const arm_compute::Window &win) { - return - { - { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) }, - { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) }, - { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) }, - { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) }, - { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) }, - { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) } - }; + return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())}, + {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())}, + {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())}, + {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())}, + {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())}, + {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}}; } } //namespace arm_gemm diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp index 
834cd1061e..6fe9f13f02 100644 --- a/src/cpu/kernels/assembly/gemm_common.hpp +++ b/src/cpu/kernels/assembly/gemm_common.hpp @@ -25,7 +25,6 @@ #include "convolution_parameters.hpp" #include "ndrange.hpp" - #include <cstddef> namespace arm_gemm @@ -51,10 +50,19 @@ public: * appropriately typed pointers. If B is pretransposed (see below) then * the settings for B here are ignored. */ - virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; + virtual void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) = 0; /** @returns an ndrange containing ranges of the compute space which can be * broken up and parallelised over @@ -73,7 +81,7 @@ public: * This has an empty default implementation, as GEMMs which don't care * about thread count can safely ignore this. */ - virtual void set_nthreads(int) {}; + virtual void set_nthreads(int){}; /* Whether this GEMM can be dynamically scheduled or not. */ virtual bool supports_dynamic_scheduling() const @@ -95,7 +103,7 @@ public: return 0; } /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void set_working_space(void *) {}; + virtual void set_working_space(void *){}; /*** "Pretransposed" interface (optional) ***/ /* Is this object set up for pretranspose? 
If so, pretranspose_array() needs to be called before execute(); */ @@ -122,7 +130,8 @@ public: /* The "real" version of this depends on the templated operand type (see below). */ virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; /* Threaded version with window start/end parameters */ - virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; + virtual void + pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ virtual void set_pretransposed_B_data(void *) @@ -186,10 +195,19 @@ protected: public: /* Pass in the pointers to the arrays to be operated on and their * strides (templated version with appropriate types). */ - virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) + virtual void set_arrays(const To *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const To *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + Tr *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const Tr *bias, + /* no row or batch stride needed */ const int bias_multi_stride) { _Aptr = A; _lda = lda; @@ -207,25 +225,33 @@ public: } /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ - void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override + void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) override { - set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, - static_cast<const To *>(B), ldb, B_multi_stride, - static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride, + set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb, + B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride, static_cast<const Tr *>(bias), bias_multi_stride); } /*** "Pretransposed" interface ***/ /* Compute col sums over all columns */ - virtual void requantize_bias(void *, const To *, const int, const int) {}; + virtual void requantize_bias(void *, const To *, const int, const int){}; /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; + virtual void pretranspose_B_array(void *, const To *, const int, const int){}; /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override @@ -237,12 +263,14 @@ public: * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only * legal values for start and end are 0 and 1 respectively. */ - virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) + virtual void + pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) { pretranspose_B_array(out, in, row_stride, multi_stride); }; - void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override + void pretranspose_B_array_part_generic( + void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override { pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end); } diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp index 1c8261aef7..baccdc0d88 100644 --- a/src/cpu/kernels/assembly/ndrange.hpp +++ b/src/cpu/kernels/assembly/ndrange.hpp @@ -45,8 +45,7 @@ private: unsigned int m_end = 0; public: - NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) - : m_parent(p), m_pos(s), m_end(e) + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { } @@ -59,12 +58,12 @@ private: { unsigned int r = m_pos; - if(d < (D - 1)) + if (d < (D - 1)) { r %= m_parent.m_totalsizes[d]; } - if(d > 0) + if (d > 0) { r /= m_parent.m_totalsizes[d - 1]; } @@ -98,9 +97,9 @@ private: { unsigned int t = 1; - for(unsigned int i = 0; i < D; i++) + for (unsigned int i = 0; i < D; i++) { - if(m_sizes[i] == 0) + if (m_sizes[i] == 0) { m_sizes[i] = 
1; } @@ -116,14 +115,12 @@ public: NDRange(const NDRange &rhs) = default; template <typename... T> - NDRange(T... ts) - : m_sizes{ ts... } + NDRange(T... ts) : m_sizes{ts...} { set_totalsizes(); } - NDRange(const std::array<unsigned int, D> &n) - : m_sizes(n) + NDRange(const std::array<unsigned int, D> &n) : m_sizes(n) { set_totalsizes(); } @@ -163,7 +160,7 @@ public: std::array<int_t, N> sizes{}; std::size_t i = 0; - for(auto &p : list) + for (auto &p : list) { m_positions[i] = p.first; sizes[i++] = p.second; |