Diffstat (limited to 'src/cpu/kernels/assembly/gemm_common.hpp')
-rw-r--r-- src/cpu/kernels/assembly/gemm_common.hpp | 74
1 file changed, 51 insertions(+), 23 deletions(-)
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index 834cd1061e..6fe9f13f02 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -25,7 +25,6 @@
#include "convolution_parameters.hpp"
#include "ndrange.hpp"
-
#include <cstddef>
namespace arm_gemm
@@ -51,10 +50,19 @@ public:
* appropriately typed pointers. If B is pretransposed (see below) then
* the settings for B here are ignored.
*/
- virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+ virtual void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) = 0;
/** @returns an ndrange containing ranges of the compute space which can be
* broken up and parallelised over
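The stride parameters above follow the usual arm_gemm addressing scheme: `lda`/`ldb`/`ldc` separate rows, the batch strides separate batches within one problem, and the multi strides separate independent problems. A minimal sketch of that addressing, assuming a row-major layout; every name here is illustrative, only the stride roles come from the declaration above:

```cpp
#include <cstddef>

// Hypothetical helper showing how the strides locate one element of an
// operand passed to set_arrays_generic().
template <typename T>
const T *element(const T *base, int row, int col, int batch, int multi,
                 int ld, int batch_stride, int multi_stride)
{
    // ld           : stride between consecutive rows of one matrix
    // batch_stride : stride between batches of one problem (B has none,
    //                since batches share B)
    // multi_stride : stride between independent "multi" problems
    return base + static_cast<std::ptrdiff_t>(multi) * multi_stride +
           static_cast<std::ptrdiff_t>(batch) * batch_stride +
           static_cast<std::ptrdiff_t>(row) * ld + col;
}
```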
@@ -73,7 +81,7 @@ public:
* This has an empty default implementation, as GEMMs which don't care
* about thread count can safely ignore this.
*/
- virtual void set_nthreads(int) {};
+ virtual void set_nthreads(int){};
/* Whether this GEMM can be dynamically scheduled or not. */
virtual bool supports_dynamic_scheduling() const
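Because `set_nthreads` has an empty default, kernels that partition work by thread count opt in by overriding it. A self-contained sketch of that; `GemmBase` and `BlockedGemm` are stand-ins for illustration, not the real arm_gemm classes:

```cpp
#include <algorithm>

// Stand-in for the interface above: empty set_nthreads default.
struct GemmBase
{
    virtual void set_nthreads(int) {}
    virtual bool supports_dynamic_scheduling() const
    {
        return false;
    }
    virtual ~GemmBase() = default;
};

// A kernel that cares about thread count records (and sanitises) the hint.
struct BlockedGemm : GemmBase
{
    int _nthreads = 1;

    void set_nthreads(int n) override
    {
        _nthreads = std::max(1, n);
    }
    bool supports_dynamic_scheduling() const override
    {
        return true;
    }
};
```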
@@ -95,7 +103,7 @@ public:
return 0;
}
/* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) {};
+ virtual void set_working_space(void *){};
/*** "Pretransposed" interface (optional) ***/
/* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */
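`get_working_size` (whose default `return 0;` appears in the hunk above) and `set_working_space` form a pair: the caller queries the size, allocates a buffer that must stay alive for the duration of every `execute()` call, and passes it in. A sketch of that protocol against a stand-in interface:

```cpp
#include <cstddef>
#include <vector>

// Stand-in for the interface above, not the real arm_gemm class.
struct Gemm
{
    virtual std::size_t get_working_size() const
    {
        return 0;
    }
    virtual void set_working_space(void *) {}
    virtual ~Gemm() = default;
};

void prepare(Gemm &gemm, std::vector<unsigned char> &storage)
{
    // storage must remain allocated until the last execute() call returns.
    storage.resize(gemm.get_working_size());
    gemm.set_working_space(storage.empty() ? nullptr : storage.data());
}
```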
@@ -122,7 +130,8 @@ public:
/* The "real" version of this depends on the templated operand type (see below). */
virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
/* Threaded version with window start/end parameters */
- virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+ virtual void
+ pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
/* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
virtual void set_pretransposed_B_data(void *)
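The `start`/`end` parameters let callers divide the pretranspose work across threads by splitting the window the GEMM reports. A sketch of such a driver; the `Gemm` type is a stand-in modelled on this header, and `get_B_pretranspose_window_size()` is assumed from the surrounding file rather than shown in this hunk:

```cpp
#include <cstddef>
#include <thread>
#include <vector>

struct Gemm
{
    virtual std::size_t get_B_pretranspose_window_size() const
    {
        return 1;
    }
    virtual void pretranspose_B_array_part_generic(
        void *out, const void *in, int row_stride, int multi_stride, std::size_t start, std::size_t end) = 0;
    virtual ~Gemm() = default;
};

void pretranspose_threaded(Gemm &gemm, void *out, const void *in, int row_stride, int multi_stride, unsigned nthreads)
{
    const std::size_t window = gemm.get_B_pretranspose_window_size();
    std::vector<std::thread> workers;
    for (unsigned t = 0; t < nthreads; t++)
    {
        // Give each thread a contiguous, non-overlapping slice of the window.
        const std::size_t start = window * t / nthreads;
        const std::size_t end   = window * (t + 1) / nthreads;
        if (start == end)
            continue;
        workers.emplace_back([&gemm, out, in, row_stride, multi_stride, start, end] {
            gemm.pretranspose_B_array_part_generic(out, in, row_stride, multi_stride, start, end);
        });
    }
    for (auto &w : workers)
        w.join();
}
```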
@@ -186,10 +195,19 @@ protected:
public:
/* Pass in the pointers to the arrays to be operated on and their
* strides (templated version with appropriate types). */
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+ virtual void set_arrays(const To *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const To *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ Tr *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const Tr *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride)
{
_Aptr = A;
_lda = lda;
@@ -207,25 +225,33 @@ public:
}
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+ void set_arrays_generic(const void *A,
+ const int lda,
+ const int A_batch_stride,
+ const int A_multi_stride,
+ const void *B,
+ const int ldb,
+ /* batches share B */ const int B_multi_stride,
+ void *C,
+ const int ldc,
+ const int C_batch_stride,
+ const int C_multi_stride,
+ const void *bias,
+ /* no row or batch stride needed */ const int bias_multi_stride) override
{
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
- static_cast<const To *>(B), ldb, B_multi_stride,
- static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
+ set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb,
+ B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
static_cast<const Tr *>(bias), bias_multi_stride);
}
/*** "Pretransposed" interface ***/
/* Compute col sums over all columns */
- virtual void requantize_bias(void *, const To *, const int, const int) {};
+ virtual void requantize_bias(void *, const To *, const int, const int){};
/* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
/* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
+ virtual void pretranspose_B_array(void *, const To *, const int, const int){};
/* Implementation of the void * overload which casts its arguments to the appropriate type. */
void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
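Taken together, the typed `set_arrays` and the `_generic` override implement a simple type-erasure pattern: the base class exposes `void *` virtuals, and the template casts back to the real operand types. A condensed, self-contained sketch of the pattern; names are stand-ins and the parameter list is cut down to two pointers:

```cpp
// Type-erased base: schedulers that don't know the operand types hold this.
struct IGemm
{
    virtual void set_arrays_generic(const void *A, void *C) = 0;
    virtual ~IGemm() = default;
};

template <typename To, typename Tr>
struct Gemm : IGemm
{
    const To *_Aptr = nullptr;
    Tr       *_Cptr = nullptr;

    // Typed version: callers that know the operand types use this directly.
    void set_arrays(const To *A, Tr *C)
    {
        _Aptr = A;
        _Cptr = C;
    }

    // The void * overload casts and forwards, exactly as above.
    void set_arrays_generic(const void *A, void *C) override
    {
        set_arrays(static_cast<const To *>(A), static_cast<Tr *>(C));
    }
};
```

Typed callers skip the casts entirely; anything holding only the abstract base goes through the `void *` entry point.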
@@ -237,12 +263,14 @@ public:
* The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and
* just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only
* legal values for start and end are 0 and 1 respectively. */
- virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+ virtual void
+ pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
{
pretranspose_B_array(out, in, row_stride, multi_stride);
};
- void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+ void pretranspose_B_array_part_generic(
+ void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
{
pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
}
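A driver that always covers `[0, window)` is therefore correct both for this fallback (where the window size is 1) and for kernels with a genuinely windowed implementation. A sketch against a stand-in interface:

```cpp
#include <cstddef>

// Stand-in modelled on the interface above, not the real arm_gemm class.
struct Gemm
{
    virtual std::size_t get_B_pretranspose_window_size() const
    {
        return 1;
    }
    virtual void pretranspose_B_array_part_generic(
        void *out, const void *in, int row_stride, int multi_stride, std::size_t start, std::size_t end) = 0;
    virtual ~Gemm() = default;
};

void pretranspose_all(Gemm &gemm, void *out, const void *in, int row_stride, int multi_stride)
{
    // With the fallback this is the pair (0, 1), the only legal full range.
    gemm.pretranspose_B_array_part_generic(out, in, row_stride, multi_stride, 0,
                                           gemm.get_B_pretranspose_window_size());
}
```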