author     Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>    2023-09-27 17:46:17 +0100
committer  felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>    2023-09-28 12:08:05 +0000
commit     afd38f0c617d6f89b2b4532c6c44f116617e2b6f (patch)
tree       03bc7d5a762099989b16a656fa8d397b490ed70e /src/cpu/kernels/assembly
parent     bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894 (diff)
download   ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz
Apply clang-format on repository
Code is formatted as per a revised clang format configuration file (not part of this delivery). Version 14.0.6 is used.

Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...)

And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/.

Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
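The change is purely mechanical: layout only, no behavioural difference. To make the diff below easier to read, here is an illustrative before/after pair (a hypothetical Example class, not taken from the commit) showing two of the rules visible throughout: short constructor initialiser lists are joined onto one line, and "if(" gains a space.

    #include <string>
    #include <utility>

    class Example
    {
    public:
        // Before the reformat this constructor was split as:
        //   Example(std::string name)
        //       : _name(std::move(name))
        // clang-format 14 with the revised configuration joins it:
        Example(std::string name) : _name(std::move(name))
        {
        }

        void run()
        {
            // "if(" becomes "if (" everywhere in the formatted files.
            if (!_name.empty())
            {
                _name += "/tag";
            }
        }

    private:
        std::string _name;
    };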
Diffstat (limited to 'src/cpu/kernels/assembly')
-rw-r--r--  src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h  12
-rw-r--r--  src/cpu/kernels/assembly/arm_gemm.hpp                    91
-rw-r--r--  src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp      38
-rw-r--r--  src/cpu/kernels/assembly/gemm_common.hpp                 74
-rw-r--r--  src/cpu/kernels/assembly/ndrange.hpp                     19
5 files changed, 148 insertions, 86 deletions
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index 10bf8e4ff7..6e8f32ef47 100644
--- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/NEON/INEKernel.h"
#include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
@@ -57,13 +58,12 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel
public:
/** Constructor
*/
- CpuGemmAssemblyWrapperKernel()
- : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
+ CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
{
}
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
+ CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
const char *name() const override
@@ -110,7 +110,7 @@ public:
INEKernel::configure(win);
- if(!kernel_name_tag.empty())
+ if (!kernel_name_tag.empty())
{
_name += "/" + kernel_name_tag;
}
@@ -132,7 +132,7 @@ public:
private:
arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
- std::string _name;
+ std::string _name;
};
} // namespace kernel
} // namespace cpu
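Everything in this file is whitespace and alignment; the class itself stays a move-only wrapper around a raw arm_gemm kernel pointer. A minimal sketch of that ownership pattern in isolation (hypothetical MoveOnlyWrapper, not part of the library):

    #include <string>

    class MoveOnlyWrapper
    {
    public:
        MoveOnlyWrapper() : _kernel(nullptr), _name("MoveOnlyWrapper")
        {
        }
        // Copying is deleted: two wrappers must not alias one kernel pointer.
        MoveOnlyWrapper(MoveOnlyWrapper &)  = delete;
        MoveOnlyWrapper(MoveOnlyWrapper &&) = default;
        MoveOnlyWrapper &operator=(MoveOnlyWrapper &) = delete;

    private:
        void       *_kernel;
        std::string _name;
    };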
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
index 4c127b4ec3..9a913c5c58 100644
--- a/src/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -23,13 +23,12 @@
*/
#pragma once
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
#include <cstring>
#include <memory>
#include <vector>
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
namespace arm_gemm
{
enum class GemmMethod
@@ -111,8 +110,7 @@ struct GemmConfig
unsigned int outer_block_size = 0;
WeightFormat weight_format = WeightFormat::ANY;
- GemmConfig(GemmMethod method)
- : method(method)
+ GemmConfig(GemmMethod method) : method(method)
{
}
GemmConfig()
@@ -133,8 +131,7 @@ struct Activation
float param1;
float param2;
- Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
- : type(type), param1(p1), param2(p2)
+ Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2)
{
}
};
@@ -156,12 +153,32 @@ public:
bool _fast_mode;
const GemmConfig *_cfg;
- GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
- unsigned int K, unsigned int Ksections, unsigned int nbatches,
- unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
- bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr)
- : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads),
- _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg)
+ GemmArgs(const CPUInfo *ci,
+ unsigned int M,
+ unsigned int N,
+ unsigned int K,
+ unsigned int Ksections,
+ unsigned int nbatches,
+ unsigned int nmulti,
+ bool indirect_input,
+ Activation act,
+ const int maxthreads,
+ bool fixed_format = false,
+ bool fast_mode = false,
+ const GemmConfig *cfg = nullptr)
+ : _ci(ci),
+ _Msize(M),
+ _Nsize(N),
+ _Ksize(K),
+ _Ksections(Ksections),
+ _nbatches(nbatches),
+ _nmulti(nmulti),
+ _indirect_input(indirect_input),
+ _act(act),
+ _maxthreads(maxthreads),
+ _fixed_format(fixed_format),
+ _fast_mode(fast_mode),
+ _cfg(cfg)
{
}
};
@@ -187,23 +204,51 @@ public:
Requantize32() = default;
// Constructor for per-tensor quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
- per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
+ int32_t requant_shift,
+ int32_t requant_mul,
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(false),
+ per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
+ per_layer_right_shift(std::min<int32_t>(requant_shift, 0)),
+ per_layer_mul(requant_mul),
+ minval(minv),
+ maxval(maxv)
{
}
// Constructor for per-channel quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
+ Requantize32(const int32_t *bias,
+ size_t bias_multi_stride,
+ int32_t a_offset,
+ int32_t b_offset,
+ int32_t c_offset,
const int32_t *requant_left_shifts,
const int32_t *requant_right_shifts,
const int32_t *requant_muls,
- int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts),
- per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv)
+ int32_t minv,
+ int32_t maxv)
+ : bias(bias),
+ bias_multi_stride(bias_multi_stride),
+ a_offset(a_offset),
+ b_offset(b_offset),
+ c_offset(c_offset),
+ per_channel_requant(true),
+ per_channel_left_shifts(requant_left_shifts),
+ per_channel_right_shifts(requant_right_shifts),
+ per_channel_muls(requant_muls),
+ minval(minv),
+ maxval(maxv)
{
}
};
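One detail the reflowed per-tensor constructor makes easier to see: a single signed requant_shift is split into a non-negative left shift and a non-positive right shift via std::max/std::min. A standalone sketch of that split (made-up shift values, nothing from the library needed beyond the standard headers):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        for (int32_t requant_shift : {3, 0, -2})
        {
            // Mirrors the Requantize32 member initialisers above.
            const int32_t left  = std::max<int32_t>(requant_shift, 0);
            const int32_t right = std::min<int32_t>(requant_shift, 0);
            std::printf("shift=%d -> left=%d right=%d\n", requant_shift, left, right);
        }
        return 0;
    }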
diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
index 718fcd1fb4..0672e899b6 100644
--- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/Window.h"
#include "ndrange.hpp"
-
#include <cassert>
/* This file contains mapping between integral types used in arm_compute and arm_gemm
@@ -38,8 +37,7 @@
namespace arm_gemm
{
//we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library
-constexpr std::size_t ndrange_max =
- arm_compute::Dimensions<unsigned int>::num_max_dimensions;
+constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions;
using ndrange_t = NDRange<ndrange_max>;
using ndcoord_t = NDCoordinate<ndrange_max>;
@@ -56,7 +54,7 @@ inline arm_compute::Window to_window(const ndrange_t &ndr)
{
arm_compute::Window win;
- for(unsigned int i = 0; i != ndrange_max; ++i)
+ for (unsigned int i = 0; i != ndrange_max; ++i)
{
//populate the window with the dimensions of the NDRange
win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
@@ -75,7 +73,7 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc)
{
arm_compute::Window win;
- for(unsigned int i = 0; i != ndrange_max; ++i)
+ for (unsigned int i = 0; i != ndrange_max; ++i)
{
const auto start = ndc.get_position(i);
const auto size = ndc.get_size(i);
@@ -98,15 +96,12 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc)
*/
inline ndrange_t to_ndrange(const arm_compute::Window &win)
{
- return
- {
- static_cast<unsigned int>(win[0].end() - win[0].start()),
- static_cast<unsigned int>(win[1].end() - win[1].start()),
- static_cast<unsigned int>(win[2].end() - win[2].start()),
- static_cast<unsigned int>(win[3].end() - win[3].start()),
- static_cast<unsigned int>(win[4].end() - win[4].start()),
- static_cast<unsigned int>(win[5].end() - win[5].start())
- };
+ return {static_cast<unsigned int>(win[0].end() - win[0].start()),
+ static_cast<unsigned int>(win[1].end() - win[1].start()),
+ static_cast<unsigned int>(win[2].end() - win[2].start()),
+ static_cast<unsigned int>(win[3].end() - win[3].start()),
+ static_cast<unsigned int>(win[4].end() - win[4].start()),
+ static_cast<unsigned int>(win[5].end() - win[5].start())};
}
/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
@@ -116,15 +111,12 @@ inline ndrange_t to_ndrange(const arm_compute::Window &win)
*/
inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
{
- return
- {
- { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
- { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
- { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
- { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
- { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
- { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
- };
+ return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())},
+ {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())},
+ {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())},
+ {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())},
+ {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())},
+ {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}};
}
} //namespace arm_gemm
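These helpers are pure data conversions: to_ndcoord() records each Window dimension as a {start, size} pair with size = end - start, and to_window() rebuilds Window dimensions from those pairs (with the default step of 1, so step information does not survive the round trip). A minimal round-trip sketch, assuming the two headers shown above are on the include path:

    #include "arm_compute/core/Window.h"
    #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"

    arm_compute::Window roundtrip(const arm_compute::Window &win)
    {
        // Window -> per-dimension {start, size} pairs -> Window
        const arm_gemm::ndcoord_t ndc = arm_gemm::to_ndcoord(win);
        return arm_gemm::to_window(ndc);
    }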
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index 834cd1061e..6fe9f13f02 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -25,7 +25,6 @@
#include "convolution_parameters.hpp"
#include "ndrange.hpp"
-
#include <cstddef>
namespace arm_gemm
@@ -51,10 +50,19 @@ public:
* appropriately typed pointers. If B is pretransposed (see below) then
* the settings for B here are ignored.
*/
- virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+ virtual void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) = 0;
/** @returns an ndrange containing ranges of the compute space which can be
* broken up and parallelised over
@@ -73,7 +81,7 @@ public:
* This has an empty default implementation, as GEMMs which don't care
* about thread count can safely ignore this.
*/
- virtual void set_nthreads(int) {};
+ virtual void set_nthreads(int){};
/* Whether this GEMM can be dynamically scheduled or not. */
virtual bool supports_dynamic_scheduling() const
@@ -95,7 +103,7 @@ public:
return 0;
}
/* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) {};
+ virtual void set_working_space(void *){};
/*** "Pretransposed" interface (optional) ***/
/* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */
@@ -122,7 +130,8 @@ public:
/* The "real" version of this depends on the templated operand type (see below). */
virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
/* Threaded version with window start/end parameters */
- virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+ virtual void
+ pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
/* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
virtual void set_pretransposed_B_data(void *)
@@ -186,10 +195,19 @@ protected:
public:
/* Pass in the pointers to the arrays to be operated on and their
* strides (templated version with appropriate types). */
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+ virtual void set_arrays(const To *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const To *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ Tr *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const Tr *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride)
{
_Aptr = A;
_lda = lda;
@@ -207,25 +225,33 @@ public:
}
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+ void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) override
{
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
- static_cast<const To *>(B), ldb, B_multi_stride,
- static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
+ set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb,
+ B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
static_cast<const Tr *>(bias), bias_multi_stride);
}
/*** "Pretransposed" interface ***/
/* Compute col sums over all columns */
- virtual void requantize_bias(void *, const To *, const int, const int) {};
+ virtual void requantize_bias(void *, const To *, const int, const int){};
/* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
/* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
+ virtual void pretranspose_B_array(void *, const To *, const int, const int){};
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
@@ -237,12 +263,14 @@ public:
* The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and
* just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only
* legal values for start and end are 0 and 1 respectively. */
- virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+ virtual void
+ pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
{
pretranspose_B_array(out, in, row_stride, multi_stride);
};
- void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+ void pretranspose_B_array_part_generic(
+ void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
{
pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
}
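The *_generic overloads exist so a caller holding only untyped buffers can drive a typed GemmCommon; the override static_casts and forwards, as shown above. A minimal sketch of binding buffers through the generic interface, assuming a GemmCommon<float, float> instance obtained elsewhere (the stride values are placeholders, not meaningful defaults):

    #include "src/cpu/kernels/assembly/gemm_common.hpp"

    void bind_buffers(arm_gemm::GemmCommon<float, float> &gemm,
                      const void *A, const void *B, void *C, const void *bias)
    {
        // Arguments follow the set_arrays_generic signature above;
        // real values depend on the workload's shapes and batching.
        gemm.set_arrays_generic(A, /* lda */ 64, /* A_batch_stride */ 0, /* A_multi_stride */ 0,
                                B, /* ldb */ 64, /* B_multi_stride */ 0,
                                C, /* ldc */ 64, /* C_batch_stride */ 0, /* C_multi_stride */ 0,
                                bias, /* bias_multi_stride */ 0);
    }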
diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp
index 1c8261aef7..baccdc0d88 100644
--- a/src/cpu/kernels/assembly/ndrange.hpp
+++ b/src/cpu/kernels/assembly/ndrange.hpp
@@ -45,8 +45,7 @@ private:
unsigned int m_end = 0;
public:
- NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
- : m_parent(p), m_pos(s), m_end(e)
+ NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e)
{
}
@@ -59,12 +58,12 @@ private:
{
unsigned int r = m_pos;
- if(d < (D - 1))
+ if (d < (D - 1))
{
r %= m_parent.m_totalsizes[d];
}
- if(d > 0)
+ if (d > 0)
{
r /= m_parent.m_totalsizes[d - 1];
}
@@ -98,9 +97,9 @@ private:
{
unsigned int t = 1;
- for(unsigned int i = 0; i < D; i++)
+ for (unsigned int i = 0; i < D; i++)
{
- if(m_sizes[i] == 0)
+ if (m_sizes[i] == 0)
{
m_sizes[i] = 1;
}
@@ -116,14 +115,12 @@ public:
NDRange(const NDRange &rhs) = default;
template <typename... T>
- NDRange(T... ts)
- : m_sizes{ ts... }
+ NDRange(T... ts) : m_sizes{ts...}
{
set_totalsizes();
}
- NDRange(const std::array<unsigned int, D> &n)
- : m_sizes(n)
+ NDRange(const std::array<unsigned int, D> &n) : m_sizes(n)
{
set_totalsizes();
}
@@ -163,7 +160,7 @@ public:
std::array<int_t, N> sizes{};
std::size_t i = 0;
- for(auto &p : list)
+ for (auto &p : list)
{
m_positions[i] = p.first;
sizes[i++] = p.second;
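For reference, the dim() arithmetic in NDRangeIterator decomposes a flat position into per-dimension coordinates using the cumulative products that set_totalsizes() stores. A standalone sketch of the same decomposition for a hypothetical 2x3x4 range:

    #include <array>
    #include <cstdio>

    // totalsizes[i] = sizes[0] * ... * sizes[i], as set_totalsizes() computes.
    unsigned int dim_of(unsigned int pos, unsigned int d, const std::array<unsigned int, 3> &totalsizes)
    {
        unsigned int r = pos;
        if (d < 2) // every dimension but the outermost wraps around
        {
            r %= totalsizes[d];
        }
        if (d > 0) // strip off the faster-moving inner dimensions
        {
            r /= totalsizes[d - 1];
        }
        return r;
    }

    int main()
    {
        const std::array<unsigned int, 3> totalsizes{2, 6, 24}; // sizes {2, 3, 4}
        // Flat position 17 -> coordinates (1, 2, 2), since 17 = 1 + 2*2 + 2*6.
        std::printf("(%u, %u, %u)\n", dim_of(17, 0, totalsizes), dim_of(17, 1, totalsizes),
                    dim_of(17, 2, totalsizes));
        return 0;
    }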