From 5f707736413aeac77818c42838296966f8dc6761 Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Tue, 3 Jul 2018 16:22:02 +0100 Subject: COMPMID-1369: Revert accidental formatting of RSH's repo Pulled latest fixes from David's repo: commit f43ebe932c84083332b0b1a0348241b69dda63a7 Author: David Mansell Date: Tue Jul 3 18:09:01 2018 +0100 Whitespace tidying, fixed comment in gemv_batched imported from ACL. Change-Id: Ie37a623f44e90d88072236cb853ac55ac82d5f51 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/138530 Tested-by: Jenkins Reviewed-by: Georgios Pinitas Reviewed-by: Gian Marco Iodice Reviewed-by: David Mansell Reviewed-by: Anthony Barbier --- .../transforms/transpose_interleave_common.hpp | 218 ++++++++++----------- 1 file changed, 99 insertions(+), 119 deletions(-) (limited to 'src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp') diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp index 3218ca1aac..63e85c155a 100644 --- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp +++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp @@ -24,137 +24,117 @@ #pragma once template -struct TransposeInterleaveCommon -{ - // Override the moveblock_1xY methods to improve performance - static inline void moveblock_1x1(const TIn *&in0, TOut *out) - { - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast(*in0++); - } +struct TransposeInterleaveCommon { + // Override the moveblock_1xY methods to improve performance + static inline void moveblock_1x1(const TIn *&in0, TOut *out) { + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in0++); } + } - static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) - { - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast(*in0++); - } - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast(*in1++); - } + static inline void moveblock_1x2(const TIn *&in0, const TIn *&in1, TOut *out) { + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in0++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in1++); } + } - static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) - { - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast(*in0++); - } - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast(*in1++); - } - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast(*in2++); - } - for(unsigned int i = 0; i < IntBy; i++) - { - *out++ = static_cast(*in3++); + static inline void moveblock_1x4(const TIn *&in0, const TIn *&in1, const TIn *&in2, const TIn *&in3, TOut *out) { + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in0++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in1++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in2++); + } + for (unsigned int i = 0; i < IntBy; i++) { + *out++ = static_cast(*in3++); + } + } + + static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) { + const auto ldin = stride; + + TOut *outarray = out; + const TIn *inarray = in; + TOut *outptr_base = outarray; + const TIn *inptr_base = inarray + x0 + (k0 * ldin); + int ldout = (kmax - k0) * IntBy; + + int k=(kmax-k0); + for ( ; k>3; k-=4) { + TOut *outptr = outptr_base; + const TIn *inptr = inptr_base; + const TIn *inptr1 = inptr + ldin; + const TIn *inptr2 = inptr1 + ldin; + const TIn *inptr3 = inptr2 + ldin; + + prefetch_3x(inptr); + prefetch_3x(inptr1); + prefetch_3x(inptr2); + prefetch_3x(inptr3); + + outptr_base += IntBy * 4; + inptr_base += ldin * 4; + + for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { + moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr); + outptr += ldout; } } - static inline void Transform(TOut *out, const TIn *in, const int stride, const int x0, const int xmax, const int k0, const int kmax) - { - const auto ldin = stride; - - TOut *outarray = out; - const TIn *inarray = in; - TOut *outptr_base = outarray; - const TIn *inptr_base = inarray + x0 + (k0 * ldin); - int ldout = (kmax - k0) * IntBy; - - int k = (kmax - k0); - for(; k > 3; k -= 4) - { - TOut *outptr = outptr_base; - const TIn *inptr = inptr_base; - const TIn *inptr1 = inptr + ldin; - const TIn *inptr2 = inptr1 + ldin; - const TIn *inptr3 = inptr2 + ldin; - - prefetch_3x(inptr); - prefetch_3x(inptr1); - prefetch_3x(inptr2); - prefetch_3x(inptr3); - - outptr_base += IntBy * 4; - inptr_base += ldin * 4; - - for(int x = (xmax - x0) / IntBy; x > 0; x--) - { - moveblock_1x4(inptr, inptr1, inptr2, inptr3, outptr); - outptr += ldout; + if (k) { + TOut *outptr = outptr_base; + const TIn *inptr = inptr_base; + const TIn *inptr1 = inptr + ldin; + const TIn *inptr2 = inptr1 + ldin; + + prefetch_3x(inptr); + prefetch_3x(inptr1); + prefetch_3x(inptr2); + + for (int x = (xmax-x0) / IntBy; x > 0 ; x--) { + switch(k) { + case 3: + moveblock_1x2(inptr, inptr1, outptr); + moveblock_1x1(inptr2, outptr + IntBy * 2); + break; + + case 2: + moveblock_1x2(inptr, inptr1, outptr); + break; + + case 1: + moveblock_1x1(inptr, outptr); + break; + + default: + UNREACHABLE("Impossible."); } - } - if(k) - { - TOut *outptr = outptr_base; - const TIn *inptr = inptr_base; - const TIn *inptr1 = inptr + ldin; - const TIn *inptr2 = inptr1 + ldin; - - prefetch_3x(inptr); - prefetch_3x(inptr1); - prefetch_3x(inptr2); - - for(int x = (xmax - x0) / IntBy; x > 0; x--) - { - switch(k) - { - case 3: - moveblock_1x2(inptr, inptr1, outptr); - moveblock_1x1(inptr2, outptr + IntBy * 2); - break; - - case 2: - moveblock_1x2(inptr, inptr1, outptr); - break; - - case 1: - moveblock_1x1(inptr, outptr); - break; - - default: - UNREACHABLE("Impossible."); - } - - outptr += ldout; - } + outptr += ldout; } + } + + // Cope with ragged X cases + const unsigned int overflow = (xmax - x0) % IntBy; + if (overflow) { + const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin); + TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout; + + for (int k=(kmax-k0); k>0; k--) { + const TIn *inptr = inptr_base; + inptr_base += ldin; - // Cope with ragged X cases - const unsigned int overflow = (xmax - x0) % IntBy; - if(overflow) - { - const TIn *inptr_base = inarray + (xmax - overflow) + (k0 * ldin); - TOut *outptr = outarray + ((xmax - x0) / IntBy) * ldout; - - for(int k = (kmax - k0); k > 0; k--) - { - const TIn *inptr = inptr_base; - inptr_base += ldin; - - for(unsigned int x = 0; x < IntBy; x++) - { - TOut val = (x < overflow) ? static_cast(*inptr++) : static_cast(0); - *outptr++ = val; - } + for (unsigned int x=0; x < IntBy; x++) { + TOut val = (x < overflow) ? static_cast(*inptr++) : static_cast(0); + *outptr++ = val; } } } +} }; -- cgit v1.2.1