From bdcdc39d89b6a6556f5c0483af5379f75eae0c55 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 22 Apr 2021 16:42:03 +0100 Subject: Enable fat binary support Changes our build system to allow building both Neon(TM) and SVE kernels and package them in the same binary. This will allow runtime selection of the underlying architecture. Adds new build option, fat_binary, for enabling this feature. Change-Id: I8e8386149773ce28e071a2fb7ddd8c8ae0f28a4a Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5704 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- Android.bp | 8 +- SConscript | 171 ++++----- SConstruct | 13 + filelist.json | 288 +++++++++++++++ scripts/clang_tidy_rules.py | 2 + src/core/NEON/SVEMath.h | 4 +- src/core/NEON/SVEMath.inl | 4 +- .../kernels/NEBatchNormalizationLayerKernel.cpp | 7 +- .../kernels/batchnormalization/impl/SVE/fp16.cpp | 4 +- .../kernels/batchnormalization/impl/SVE/fp32.cpp | 4 +- src/core/NEON/wrapper/intrinsics/svpow.h | 10 +- src/core/NEON/wrapper/svtraits.h | 4 +- src/core/NEON/wrapper/traits.h | 8 +- src/core/common/Registrars.h | 70 +++- src/core/cpu/kernels/CpuActivationKernel.cpp | 12 +- src/core/cpu/kernels/CpuAddKernel.cpp | 10 +- src/core/cpu/kernels/CpuElementwiseKernel.cpp | 63 ++-- src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 6 +- src/core/cpu/kernels/CpuScaleKernel.cpp | 19 +- src/core/cpu/kernels/CpuSoftmaxKernel.cpp | 19 +- src/core/cpu/kernels/activation/sve/fp16.cpp | 4 +- src/core/cpu/kernels/activation/sve/fp32.cpp | 4 +- src/core/cpu/kernels/add/sve/impl.cpp | 137 ++++++++ src/core/cpu/kernels/add/sve/impl.h | 40 +++ src/core/cpu/kernels/add/sve/integer.cpp | 6 +- src/core/cpu/kernels/add/sve/list.h | 97 +----- .../cpu/kernels/elementwise/sve/elementwise.cpp | 309 ++++++++++++++++ .../cpu/kernels/elementwise/sve/elementwise_list.h | 265 ++------------ .../elementwise/sve/elementwise_quantized_list.h | 19 +- .../kernels/elementwise/sve/elementwise_unary.cpp | 111 ++++++ .../elementwise/sve/elementwise_unary_list.h | 78 +---- src/core/cpu/kernels/floor/NEON/fp16.cpp | 64 ---- src/core/cpu/kernels/floor/NEON/fp32.cpp | 61 ---- src/core/cpu/kernels/floor/neon/fp16.cpp | 64 ++++ src/core/cpu/kernels/floor/neon/fp32.cpp | 61 ++++ src/core/cpu/kernels/scale/sve/fp16.cpp | 8 +- src/core/cpu/kernels/scale/sve/fp32.cpp | 4 +- src/core/cpu/kernels/scale/sve/integer.cpp | 7 +- src/core/cpu/kernels/scale/sve/qasymm8.cpp | 11 +- src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp | 11 +- src/core/cpu/kernels/softmax/impl/NEON/list.h | 388 --------------------- src/core/cpu/kernels/softmax/impl/SVE/list.h | 353 ------------------- src/core/cpu/kernels/softmax/impl/neon/list.h | 388 +++++++++++++++++++++ src/core/cpu/kernels/softmax/impl/sve/impl.cpp | 185 ++++++++++ src/core/cpu/kernels/softmax/impl/sve/list.h | 223 ++++++++++++ tests/validation/NEON/ActivationLayer.cpp | 24 +- tests/validation/NEON/ArithmeticAddition.cpp | 6 +- 47 files changed, 2120 insertions(+), 1534 deletions(-) create mode 100644 filelist.json create mode 100644 src/core/cpu/kernels/add/sve/impl.cpp create mode 100644 src/core/cpu/kernels/add/sve/impl.h create mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise.cpp create mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp delete mode 100644 src/core/cpu/kernels/floor/NEON/fp16.cpp delete mode 100644 src/core/cpu/kernels/floor/NEON/fp32.cpp create mode 100644 src/core/cpu/kernels/floor/neon/fp16.cpp create mode 
100644 src/core/cpu/kernels/floor/neon/fp32.cpp delete mode 100644 src/core/cpu/kernels/softmax/impl/NEON/list.h delete mode 100644 src/core/cpu/kernels/softmax/impl/SVE/list.h create mode 100644 src/core/cpu/kernels/softmax/impl/neon/list.h create mode 100644 src/core/cpu/kernels/softmax/impl/sve/impl.cpp create mode 100644 src/core/cpu/kernels/softmax/impl/sve/list.h diff --git a/Android.bp b/Android.bp index d1003f2d7d..cbb58b1ce9 100644 --- a/Android.bp +++ b/Android.bp @@ -314,12 +314,15 @@ cc_library_static { "src/core/cpu/kernels/add/neon/qasymm8.cpp", "src/core/cpu/kernels/add/neon/qasymm8_signed.cpp", "src/core/cpu/kernels/add/neon/qsymm16.cpp", + "src/core/cpu/kernels/add/sve/impl.cpp", "src/core/cpu/kernels/add/sve/integer.cpp", "src/core/cpu/kernels/add/sve/qasymm8.cpp", "src/core/cpu/kernels/add/sve/qasymm8_signed.cpp", "src/core/cpu/kernels/add/sve/qsymm16.cpp", - "src/core/cpu/kernels/floor/NEON/fp16.cpp", - "src/core/cpu/kernels/floor/NEON/fp32.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp", + "src/core/cpu/kernels/floor/neon/fp16.cpp", + "src/core/cpu/kernels/floor/neon/fp32.cpp", "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp", "src/core/cpu/kernels/pooling/neon/fp16.cpp", "src/core/cpu/kernels/pooling/neon/fp32.cpp", @@ -335,6 +338,7 @@ cc_library_static { "src/core/cpu/kernels/scale/sve/integer.cpp", "src/core/cpu/kernels/scale/sve/qasymm8.cpp", "src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp", + "src/core/cpu/kernels/softmax/impl/sve/impl.cpp", "src/core/cpu/kernels/sub/neon/integer.cpp", "src/core/cpu/kernels/sub/neon/qasymm8.cpp", "src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp", diff --git a/SConscript b/SConscript index 143823d013..94ba6d423f 100644 --- a/SConscript +++ b/SConscript @@ -26,6 +26,7 @@ import subprocess import zlib import base64 import string +import json VERSION = "v0.0-unreleased" LIBRARY_VERSION_MAJOR = 23 @@ -38,13 +39,20 @@ Import('vars') Import('install_lib') def build_bootcode_objs(sources): - arm_compute_env.Append(ASFLAGS = "-I bootcode/") obj = arm_compute_env.Object(sources) obj = install_lib(obj) Default(obj) return obj +def build_sve_objs(sources): + tmp_env = arm_compute_env.Clone() + tmp_env.Append(CXXFLAGS = "-march=armv8.2-a+sve+fp16") + obj = tmp_env.SharedObject(sources) + obj = install_lib(obj) + Default(obj) + return obj + def build_library(name, build_env, sources, static=False, libs=[]): if static: obj = build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs) @@ -172,6 +180,9 @@ arm_compute_env.Append(CPPPATH =[Dir("./src/core/").path] ) arm_compute_env.Append(LIBS = ['dl']) +with (open(Dir('#').path + '/filelist.json')) as fp: + filelist = json.load(fp) + core_files = Glob('src/core/*.cpp') core_files += Glob('src/core/CPP/*.cpp') core_files += Glob('src/core/CPP/kernels/*.cpp') @@ -189,25 +200,14 @@ runtime_files += Glob('src/runtime/CPP/ICPPSimpleFunction.cpp') runtime_files += Glob('src/runtime/CPP/functions/*.cpp') # C API files -c_api_files = ['src/c/AclContext.cpp', - 'src/c/AclQueue.cpp', - 'src/c/AclTensor.cpp', - 'src/c/AclTensorPack.cpp', - 'src/c/AclVersion.cpp', - ] +runtime_files += filelist['c_api']['cpu'] + if env['opencl']: - c_api_files += ['src/c/cl/AclOpenClExt.cpp'] + runtime_files += filelist['c_api']['gpu'] # Common backend files -common_backend_files = ['src/common/utils/LegacySupport.cpp', - 'src/common/AllocatorWrapper.cpp', - 'src/common/ITensorV2.cpp', - 
'src/common/TensorPack.cpp', - ] - -core_files += common_backend_files -runtime_files += c_api_files -# CLHarrisCorners uses the Scheduler to run CPP kernels +core_files += filelist['common'] + runtime_files += Glob('src/runtime/CPP/SingleThreadScheduler.cpp') graph_files = Glob('src/graph/*.cpp') @@ -231,9 +231,7 @@ if env['opencl']: ] core_files += cl_kernel_hp_files core_files += Glob('src/core/CL/*.cpp') - core_files += Glob('src/core/CL/kernels/*.cpp') core_files += Glob('src/core/gpu/cl/*.cpp') - core_files += Glob('src/core/gpu/cl/kernels/*.cpp') runtime_files += Glob('src/runtime/CL/*.cpp') runtime_files += Glob('src/runtime/CL/functions/*.cpp') @@ -245,10 +243,12 @@ if env['opencl']: runtime_files += Glob('src/runtime/CL/gemm_auto_heuristics/*.cpp') runtime_files += Glob('src/gpu/cl/*.cpp') - graph_files += Glob('src/graph/backends/CL/*.cpp') + core_files += filelist['gpu']['core']['kernels']['high_priority'] + filelist['gpu']['core']['kernels']['all'] +sve_o = [] +core_files_sve = [] if env['neon']: core_files += Glob('src/core/NEON/*.cpp') core_files += Glob('src/core/NEON/kernels/*.cpp') @@ -277,107 +277,47 @@ if env['neon']: if env['estate'] == '64': core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp') core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/a64_*/*.cpp') - if "sve" in env['arch']: - core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp') - core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp') + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['all'] + core_files_sve += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp') + core_files_sve += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp') + + if any(i in env['data_layout_support'] for i in ['all', 'nchw']): + core_files += filelist['cpu']['core']['neon']['nchw'] if any(i in env['data_type_support'] for i in ['all', 'fp16']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/fp16.cpp') + if not "sve" in env['arch'] or env['fat_binary']: + core_files += filelist['cpu']['core']['neon']['fp16'] + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['fp16'] if any(i in env['data_type_support'] for i in ['all', 'fp32']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/fp32.cpp') + if not "sve" in env['arch'] or env['fat_binary']: + core_files += filelist['cpu']['core']['neon']['fp32'] + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['fp32'] if any(i in env['data_type_support'] for i in ['all', 'qasymm8']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8.cpp') + core_files += filelist['cpu']['core']['neon']['qasymm8'] + core_files_sve += filelist['cpu']['core']['sve']['qasymm8'] if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8_signed.cpp') + core_files += filelist['cpu']['core']['neon']['qasymm8_signed'] + core_files_sve += filelist['cpu']['core']['sve']['qasymm8_signed'] if any(i in env['data_type_support'] for i in ['all', 'qsymm16']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/qsymm16.cpp') + core_files += filelist['cpu']['core']['neon']['qsymm16'] + core_files_sve += filelist['cpu']['core']['sve']['qsymm16'] if any(i in env['data_type_support'] for i in ['all', 'integer']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/integer.cpp') + if not "sve" in 
env['arch'] or env['fat_binary']: + core_files += filelist['cpu']['core']['neon']['integer'] + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['integer'] + + core_files += Glob('src/core/cpu/kernels/*/*.cpp') + core_files += filelist['cpu']['core']['kernels']['high_priority'] + filelist['cpu']['core']['kernels']['all'] runtime_files += Glob('src/runtime/NEON/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/assembly/*.cpp') - - cpu_kernel_hp_files = ['src/core/cpu/kernels/CpuActivationKernel.cpp', - 'src/core/cpu/kernels/CpuCastKernel.cpp', - 'src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp', - 'src/core/cpu/kernels/CpuDirectConv2dKernel.cpp', - 'src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp', - 'src/core/cpu/kernels/CpuPermuteKernel.cpp', - 'src/core/cpu/kernels/CpuPool2dKernel.cpp', - 'src/core/cpu/kernels/CpuReshapeKernel.cpp', - ] - cpu_kernel_files = ['src/core/cpu/kernels/CpuAddKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp', - 'src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp', - 'src/core/cpu/kernels/CpuCopyKernel.cpp', - 'src/core/cpu/kernels/CpuDequantizeKernel.cpp', - 'src/core/cpu/kernels/CpuElementwiseKernel.cpp', - 'src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp', - 'src/core/cpu/kernels/CpuFillKernel.cpp', - 'src/core/cpu/kernels/CpuFloorKernel.cpp', - 'src/core/cpu/kernels/CpuMulKernel.cpp', - 'src/core/cpu/kernels/CpuQuantizeKernel.cpp', - 'src/core/cpu/kernels/CpuScaleKernel.cpp', - 'src/core/cpu/kernels/CpuSoftmaxKernel.cpp', - 'src/core/cpu/kernels/CpuSubKernel.cpp', - 'src/core/cpu/kernels/CpuTransposeKernel.cpp', - ] - core_files += [cpu_kernel_hp_files, cpu_kernel_files] - - core_files += Glob('src/core/cpu/kernels/*/*.cpp') - if any(i in env['data_type_support'] for i in ['all', 'fp16']): - core_files += Glob('src/core/cpu/kernels/*/*/fp16.cpp') - if any(i in env['data_type_support'] for i in ['all', 'fp32']): - core_files += Glob('src/core/cpu/kernels/*/*/fp32.cpp') - if any(i in env['data_type_support'] for i in ['all', 'qasymm8']): - core_files += Glob('src/core/cpu/kernels/*/*/qasymm8.cpp') - if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']): - core_files += Glob('src/core/cpu/kernels/*/*/qasymm8_signed.cpp') - if any(i in env['data_type_support'] for i in ['all', 'qsymm16']): - core_files += Glob('src/core/cpu/kernels/*/*/qsymm16.cpp') - if any(i in env['data_type_support'] for i in ['all', 'integer']): - core_files += Glob('src/core/cpu/kernels/*/*/integer.cpp') - - if any(i in env['data_layout_support'] for i in ['all', 'nchw']): - core_files += Glob('src/core/cpu/kernels/*/*/nchw/all.cpp') - - cpu_rt_files = ['src/cpu/CpuContext.cpp', - 'src/cpu/CpuQueue.cpp', - 'src/cpu/CpuTensor.cpp' - ] - cpu_operator_hp_files = ['src/runtime/cpu/operators/CpuActivation.cpp', - 'src/runtime/cpu/operators/CpuCast.cpp', - 'src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp', - 'src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp', - 'src/runtime/cpu/operators/CpuDirectConv2d.cpp', - 'src/runtime/cpu/operators/CpuFlatten.cpp', - 'src/runtime/cpu/operators/CpuPermute.cpp', - 'src/runtime/cpu/operators/CpuPool2d.cpp', - ] - cpu_operator_files = ['src/runtime/cpu/operators/CpuAdd.cpp', - 
'src/runtime/cpu/operators/CpuConcatenate.cpp', - 'src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp', - 'src/runtime/cpu/operators/CpuCopy.cpp', - 'src/runtime/cpu/operators/CpuDequantize.cpp', - 'src/runtime/cpu/operators/CpuElementwise.cpp', - 'src/runtime/cpu/operators/CpuElementwiseUnary.cpp', - 'src/runtime/cpu/operators/CpuFill.cpp', - 'src/runtime/cpu/operators/CpuFloor.cpp', - 'src/runtime/cpu/operators/CpuMul.cpp', - 'src/runtime/cpu/operators/CpuQuantize.cpp', - 'src/runtime/cpu/operators/CpuReshape.cpp', - 'src/runtime/cpu/operators/CpuScale.cpp', - 'src/runtime/cpu/operators/CpuSoftmax.cpp', - 'src/runtime/cpu/operators/CpuSub.cpp', - 'src/runtime/cpu/operators/CpuTranspose.cpp', - 'src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp', - ] - cpu_internal_operator_files = ['src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp',] - runtime_files += [ cpu_rt_files, cpu_operator_hp_files, cpu_operator_files, cpu_internal_operator_files ] + runtime_files += filelist['cpu']['runtime']['all'] + filelist['cpu']['runtime']['operators']['high_priority'] \ + + filelist['cpu']['runtime']['operators']['all'] + filelist['cpu']['runtime']['operators']['internal'] bootcode_o = [] if env['os'] == 'bare_metal': @@ -385,11 +325,18 @@ if env['os'] == 'bare_metal': bootcode_o = build_bootcode_objs(bootcode_files) Export('bootcode_o') -arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files, static=True) +if (env['fat_binary']): + sve_o = build_sve_objs(core_files_sve) + arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files + sve_o, static=True) +else: + arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files + core_files_sve, static=True) Export('arm_compute_core_a') if env['os'] != 'bare_metal' and not env['standalone']: - arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files, static=False) + if (env['fat_binary']): + arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files + sve_o, static=False) + else: + arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files + core_files_sve, static=False) Export('arm_compute_core_so') arm_compute_a = build_library('arm_compute-static', arm_compute_env, runtime_files, static=True, libs = [ arm_compute_core_a ]) diff --git a/SConstruct b/SConstruct index d36fbab275..f800d9d105 100644 --- a/SConstruct +++ b/SConstruct @@ -53,6 +53,7 @@ vars.AddVariables( BoolVariable("examples", "Build example programs", True), BoolVariable("gemm_tuner", "Build gemm_tuner programs", True), BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True), + BoolVariable("fat_binary", "Build fat binary version of library. 
Note works only for armv8.2-a", False), BoolVariable("standalone", "Builds the tests as standalone executables, links statically with libgcc, libstdc++ and libarm_compute", False), BoolVariable("opencl", "Enable OpenCL support", True), BoolVariable("neon", "Enable Arm® Neon™ support", False), @@ -255,6 +256,11 @@ if 'x86' not in env['arch']: elif env['os'] == 'tizen': prefix = "aarch64-tizen-linux-gnu-" +if 'sve' in env['arch']: + env.Append(CXXFLAGS = ['-DENABLE_SVE']) +else: + env.Append(CXXFLAGS = ['-DENABLE_NEON']) + if env['build'] == 'native': prefix = "" @@ -298,6 +304,13 @@ if not GetOption("help"): if not version_at_least(compiler_ver, '7.0.0') and env['os'] == 'bare_metal': env.Append(LINKFLAGS = ['-fstack-protector-strong']) +if env['fat_binary']: + if env['arch'] != 'armv8.2-a': + print("Currently fat binary is only supported with armv8.2-a") + Exit(1) + env.Append(CXXFLAGS = ['-DENABLE_SVE']) + env.Append(CXXFLAGS = ['-DENABLE_NEON']) + if env['data_type_support']: if any(i in env['data_type_support'] for i in ['all', 'fp16']): env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS']) diff --git a/filelist.json b/filelist.json new file mode 100644 index 0000000000..d84a350a82 --- /dev/null +++ b/filelist.json @@ -0,0 +1,288 @@ +{ + "common" : [ + "src/common/utils/LegacySupport.cpp", + "src/common/AllocatorWrapper.cpp", + "src/common/ITensorV2.cpp", + "src/common/TensorPack.cpp" + ], + "c_api" : + { + "cpu": [ + "src/c/AclContext.cpp", + "src/c/AclQueue.cpp", + "src/c/AclTensor.cpp", + "src/c/AclTensorPack.cpp", + "src/c/AclVersion.cpp" + ], + "gpu": [ + "src/c/cl/AclOpenClExt.cpp" + ] + }, + + "gpu" : + { + "core" : + { + "kernels" : + { + "high_priority" : [ + "src/core/gpu/cl/kernels/ClActivationKernel.cpp", + "src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp", + "src/core/gpu/cl/kernels/ClPermuteKernel.cpp", + "src/core/gpu/cl/kernels/ClPool2dKernel.cpp", + "src/core/gpu/cl/kernels/ClReshapeKernel.cpp" + ], + "all" : [ + "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClCastKernel.cpp", + "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp", + "src/core/gpu/cl/kernels/ClCopyKernel.cpp", + "src/core/gpu/cl/kernels/ClCropKernel.cpp", + "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClDequantizeKernel.cpp", + "src/core/gpu/cl/kernels/ClElementwiseKernel.cpp", + "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp", + "src/core/gpu/cl/kernels/ClFillKernel.cpp", + "src/core/gpu/cl/kernels/ClFloorKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp", + "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClMulKernel.cpp", + "src/core/gpu/cl/kernels/ClQuantizeKernel.cpp", + "src/core/gpu/cl/kernels/ClScaleKernel.cpp", + "src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp", + "src/core/gpu/cl/kernels/ClTransposeKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp", + "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp", + "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp", + 
"src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp", + "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp", + "src/core/CL/kernels/CLBitwiseKernel.cpp", + "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp", + "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp", + "src/core/CL/kernels/CLCol2ImKernel.cpp", + "src/core/CL/kernels/CLComparisonKernel.cpp", + "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp", + "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp", + "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp", + "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp", + "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp", + "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp", + "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp", + "src/core/CL/kernels/CLFFTRadixStageKernel.cpp", + "src/core/CL/kernels/CLFFTScaleKernel.cpp", + "src/core/CL/kernels/CLFillBorderKernel.cpp", + "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp", + "src/core/CL/kernels/CLGatherKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp", + "src/core/CL/kernels/CLIm2ColKernel.cpp", + "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp", + "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp", + "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp", + "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp", + "src/core/CL/kernels/CLMinMaxLayerKernel.cpp", + "src/core/CL/kernels/CLNormalizationLayerKernel.cpp", + "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp", + "src/core/CL/kernels/CLPadLayerKernel.cpp", + "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp", + "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp", + "src/core/CL/kernels/CLRangeKernel.cpp", + "src/core/CL/kernels/CLReductionOperationKernel.cpp", + "src/core/CL/kernels/CLRemapKernel.cpp", + "src/core/CL/kernels/CLReorgLayerKernel.cpp", + "src/core/CL/kernels/CLReverseKernel.cpp", + "src/core/CL/kernels/CLROIAlignLayerKernel.cpp", + "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp", + "src/core/CL/kernels/CLSelectKernel.cpp", + "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp", + "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp", + "src/core/CL/kernels/CLStackLayerKernel.cpp", + "src/core/CL/kernels/CLStridedSliceKernel.cpp", + "src/core/CL/kernels/CLTileKernel.cpp", + "src/core/CL/kernels/CLWeightsReshapeKernel.cpp", + "src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp", + "src/core/CL/kernels/CLWinogradInputTransformKernel.cpp", + "src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp" + ] + } + } + }, + "cpu" : + { + "runtime" : + { + "all" : [ + "src/cpu/CpuContext.cpp", + "src/cpu/CpuQueue.cpp", + "src/cpu/CpuTensor.cpp" + ], + "operators" : + { + "high_priority" : [ + "src/runtime/cpu/operators/CpuActivation.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp", + 
"src/runtime/cpu/operators/CpuDirectConv2d.cpp", + "src/runtime/cpu/operators/CpuPermute.cpp", + "src/runtime/cpu/operators/CpuPool2d.cpp" + ], + "internal" : [ + "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp" + ], + "all" : [ + "src/runtime/cpu/operators/CpuAdd.cpp", + "src/runtime/cpu/operators/CpuCast.cpp", + "src/runtime/cpu/operators/CpuConcatenate.cpp", + "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp", + "src/runtime/cpu/operators/CpuCopy.cpp", + "src/runtime/cpu/operators/CpuDequantize.cpp", + "src/runtime/cpu/operators/CpuElementwise.cpp", + "src/runtime/cpu/operators/CpuElementwiseUnary.cpp", + "src/runtime/cpu/operators/CpuFill.cpp", + "src/runtime/cpu/operators/CpuFlatten.cpp", + "src/runtime/cpu/operators/CpuFloor.cpp", + "src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp", + "src/runtime/cpu/operators/CpuMul.cpp", + "src/runtime/cpu/operators/CpuQuantize.cpp", + "src/runtime/cpu/operators/CpuReshape.cpp", + "src/runtime/cpu/operators/CpuScale.cpp", + "src/runtime/cpu/operators/CpuSoftmax.cpp", + "src/runtime/cpu/operators/CpuSub.cpp", + "src/runtime/cpu/operators/CpuTranspose.cpp" + ] + } + }, + "core" : + { + "kernels" : + { + "high_priority" : [ + "src/core/cpu/kernels/CpuActivationKernel.cpp", + "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp", + "src/core/cpu/kernels/CpuDirectConv2dKernel.cpp", + "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp", + "src/core/cpu/kernels/CpuPermuteKernel.cpp", + "src/core/cpu/kernels/CpuPool2dKernel.cpp", + "src/core/cpu/kernels/CpuReshapeKernel.cpp" + ], + "all" : [ + "src/core/cpu/kernels/CpuAddKernel.cpp", + "src/core/cpu/kernels/CpuCastKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp", + "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp", + "src/core/cpu/kernels/CpuCopyKernel.cpp", + "src/core/cpu/kernels/CpuDequantizeKernel.cpp", + "src/core/cpu/kernels/CpuElementwiseKernel.cpp", + "src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp", + "src/core/cpu/kernels/CpuFillKernel.cpp", + "src/core/cpu/kernels/CpuFloorKernel.cpp", + "src/core/cpu/kernels/CpuMulKernel.cpp", + "src/core/cpu/kernels/CpuQuantizeKernel.cpp", + "src/core/cpu/kernels/CpuScaleKernel.cpp", + "src/core/cpu/kernels/CpuSoftmaxKernel.cpp", + "src/core/cpu/kernels/CpuSubKernel.cpp", + "src/core/cpu/kernels/CpuTransposeKernel.cpp" + ] + }, + + "sve" : + { + "all" : [ + "src/core/cpu/kernels/add/sve/impl.cpp", + "src/core/cpu/kernels/softmax/impl/sve/impl.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp" + ], + "fp32" : [ + "src/core/cpu/kernels/activation/sve/fp32.cpp", + "src/core/cpu/kernels/scale/sve/fp32.cpp", + "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp" + ], + "fp16" : [ + "src/core/cpu/kernels/activation/sve/fp16.cpp", + "src/core/cpu/kernels/scale/sve/fp16.cpp", + "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp" + ], + "qsymm16" : [ + "src/core/cpu/kernels/activation/sve/qsymm16.cpp", + "src/core/cpu/kernels/add/sve/qsymm16.cpp" + ], + "qasymm8" : [ + "src/core/cpu/kernels/activation/sve/qasymm8.cpp", + "src/core/cpu/kernels/add/sve/qasymm8.cpp", + "src/core/cpu/kernels/scale/sve/qasymm8.cpp" + ], + "qasymm8_signed" : [ + "src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp", + 
"src/core/cpu/kernels/add/sve/qasymm8_signed.cpp", + "src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp" + ], + "integer" : [ + "src/core/cpu/kernels/add/sve/integer.cpp", + "src/core/cpu/kernels/scale/sve/integer.cpp" + ] + }, + + "neon": + { + "nchw" : [ + "src/core/cpu/kernels/pooling/neon/nchw/all.cpp" + ], + "fp32" : [ + "src/core/cpu/kernels/activation/neon/fp32.cpp", + "src/core/cpu/kernels/floor/neon/fp32.cpp", + "src/core/cpu/kernels/pooling/neon/fp32.cpp", + "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp" + ], + "fp16" : [ + "src/core/cpu/kernels/activation/neon/fp16.cpp", + "src/core/cpu/kernels/floor/neon/fp16.cpp", + "src/core/cpu/kernels/pooling/neon/fp16.cpp", + "src/core/cpu/kernels/scale/neon/fp16.cpp", + "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp" + ], + "qsymm16" : [ + "src/core/cpu/kernels/activation/neon/qsymm16.cpp", + "src/core/cpu/kernels/add/neon/qsymm16.cpp", + "src/core/cpu/kernels/sub/neon/qsymm16.cpp" + + ], + "qasymm8" : [ + "src/core/cpu/kernels/activation/neon/qasymm8.cpp", + "src/core/cpu/kernels/add/neon/qasymm8.cpp", + "src/core/cpu/kernels/pooling/neon/qasymm8.cpp", + "src/core/cpu/kernels/scale/neon/qasymm8.cpp", + "src/core/cpu/kernels/sub/neon/qasymm8.cpp" + ], + "qasymm8_signed" : [ + "src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/add/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp" + ], + "integer" : [ + "src/core/cpu/kernels/sub/neon/integer.cpp", + "src/core/cpu/kernels/add/neon/integer.cpp" + ] + } + } + } +} \ No newline at end of file diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 649d9343bb..8ab7c13a7c 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -63,6 +63,7 @@ def filter_clang_tidy_lines( lines ): ("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or ("CPUUtils.cpp" in line and "'asm/hwcap.h' file not found" in line) or ("CPUUtils.cpp" in line and "use of undeclared identifier 'HWCAP_SVE'" in line) or + ("sve" in line) or ("'arm_compute_version.embed' file not found" in line) ): print_context=False continue @@ -115,6 +116,7 @@ def filter_clang_tidy_lines( lines ): ("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or ("CPUUtils.cpp" in line and "warning: uninitialized record type" in line) or ("Utils.h" in line and "warning: Use of zero-allocated memory" in line) or + ("sve" in line) or ("CpuDepthwiseConv2dNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy print_context=False continue diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h index b73043a435..dde75e8088 100644 --- a/src/core/NEON/SVEMath.h +++ b/src/core/NEON/SVEMath.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_SVEMATH_H #define ARM_COMPUTE_SVEMATH_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include "src/core/NEON/wrapper/intrinsics/svcvt.h" #include "src/core/NEON/wrapper/intrinsics/svdup_n.h" #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h" @@ -185,5 +185,5 @@ int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in } // namespace arm_compute #include "src/core/NEON/SVEMath.inl" -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ #endif /* ARM_COMPUTE_SVEMATH_H */ \ 
No newline at end of file diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl index d909adfeb5..7625e5be34 100644 --- a/src/core/NEON/SVEMath.inl +++ b/src/core/NEON/SVEMath.inl @@ -24,7 +24,7 @@ #include #include -#if defined(__ARM_FEATURE_SVE) +#if defined(__ARM_FEATURE_SVE) && defined(ENABLE_SVE) #ifndef M_PI #define M_PI (3.14159265358979323846) @@ -388,4 +388,4 @@ inline svint8_t convert_float_to_int(const svfloat32_t &in_0, const sv #endif /* defined(__ARM_FEATURE_SVE2) */ } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 1691943b07..92000bb2f6 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -63,7 +63,7 @@ struct BatchNormalizationKernel static const BatchNormalizationKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp16_sve_batch_normalization", [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; }, @@ -74,7 +74,8 @@ static const BatchNormalizationKernel available_kernels[] = [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* !defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "fp16_neon_batch_normalization", @@ -87,7 +88,7 @@ static const BatchNormalizationKernel available_kernels[] = [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization) }, -#endif /* !defined(__ARM_FEATURE_SVE) */ +#endif /* !defined(ENABLE_NEON) */ }; const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data) diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp index 3e3e81d044..a715b9d3ee 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp @@ -29,7 +29,7 @@ #include #include -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include namespace arm_compute @@ -114,4 +114,4 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea } } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE +#endif // ENABLE_SVE diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp index b0d4cbb684..7cc570d8aa 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp @@ -29,7 +29,7 @@ #include #include -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include namespace arm_compute @@ -114,4 +114,4 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea } } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE +#endif // ENABLE_SVE diff --git a/src/core/NEON/wrapper/intrinsics/svpow.h b/src/core/NEON/wrapper/intrinsics/svpow.h index e89a4ab8f6..0f58d758cb 100644 --- a/src/core/NEON/wrapper/intrinsics/svpow.h +++ b/src/core/NEON/wrapper/intrinsics/svpow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. 
+ * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,8 +35,16 @@ namespace wrapper return svpow_##postfix##_z(pg, a, b); \ } +#define SVPOW_Z_IMPL_INT(type, postfix) \ + inline type svpow_z(svbool_t pg, const type &a, const type &b) \ + { \ + ARM_COMPUTE_UNUSED(pg, a, b); \ + ARM_COMPUTE_ERROR("Not supported"); \ + } + SVPOW_Z_IMPL(svfloat32_t, f32) SVPOW_Z_IMPL(svfloat16_t, f16) +SVPOW_Z_IMPL_INT(svint16_t, s16) #undef SVPOW_Z_IMPL diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h index 465983d16f..8d2d660659 100644 --- a/src/core/NEON/wrapper/svtraits.h +++ b/src/core/NEON/wrapper/svtraits.h @@ -23,7 +23,7 @@ */ #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include "src/core/NEON/SVEMath.h" #include @@ -66,5 +66,5 @@ DEFINE_TYPES(bfloat16_t) } // namespace wrapper } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ #endif /* #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H */ diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h index 3452b76761..81685140f1 100644 --- a/src/core/NEON/wrapper/traits.h +++ b/src/core/NEON/wrapper/traits.h @@ -26,9 +26,9 @@ #include -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ namespace arm_compute { @@ -116,13 +116,13 @@ template <> struct neon_bitvector{ using type = float #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) /** Create the appropriate SVE vector given its type */ template struct sve_vector; template <> struct sve_vector{ using scalar_type = uint8_t; using type = svuint8_t; }; template <> struct sve_vector{ using scalar_type = int8_t; using type = svint8_t; }; -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ #endif /* DOXYGEN_SKIP_THIS */ diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index 112c83ad94..44ddf9808d 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -26,17 +26,17 @@ #if defined(ENABLE_FP16_KERNELS) -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #define REGISTER_FP16_SVE(func_name) &(func_name) -#else /* !defined(__ARM_FEATURE_SVE) */ +#else /* !defined(ENABLE_SVE) */ #define REGISTER_FP16_SVE(func_name) nullptr -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #define REGISTER_FP16_NEON(func_name) &(func_name) -#else /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#else /* !defined(ENABLE_NEON) */ #define REGISTER_FP16_NEON(func_name) nullptr -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ #else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr @@ -44,50 +44,82 @@ #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ #if defined(ENABLE_FP32_KERNELS) -#if defined(__ARM_FEATURE_SVE) + +#if defined(ENABLE_SVE) #define REGISTER_FP32_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_FP32_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + +#if defined(ENABLE_NEON) #define 
REGISTER_FP32_NEON(func_name) &(func_name) +#else /* !defined(ENABLE_NEON) */ +#define REGISTER_FP32_NEON(func_name) nullptr +#endif /* defined(ENABLE_NEON) */ + #else /* defined(ENABLE_FP32_KERNELS) */ #define REGISTER_FP32_NEON(func_name) nullptr #define REGISTER_FP32_SVE(func_name) nullptr #endif /* defined(ENABLE_FP32_KERNELS) */ #if defined(ENABLE_QASYMM8_SIGNED_KERNELS) -#if defined(__ARM_FEATURE_SVE) -#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ + #define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name) + +#if defined(ENABLE_SVE) +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name) +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr #define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #if defined(ENABLE_QASYMM8_KERNELS) -#if defined(__ARM_FEATURE_SVE) -#define REGISTER_QASYMM8_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ #define REGISTER_QASYMM8_NEON(func_name) &(func_name) + +#if defined(ENABLE_SVE) +#define REGISTER_QASYMM8_SVE(func_name) &(func_name) +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_QASYMM8_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + #else /* defined(ENABLE_QASYMM8_KERNELS) */ #define REGISTER_QASYMM8_NEON(func_name) nullptr #define REGISTER_QASYMM8_SVE(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_KERNELS) */ #if defined(ENABLE_QSYMM16_KERNELS) -#if defined(__ARM_FEATURE_SVE) -#define REGISTER_QSYMM16_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ + #define REGISTER_QSYMM16_NEON(func_name) &(func_name) + +#if defined(ENABLE_SVE) +#define REGISTER_QSYMM16_SVE(func_name) &(func_name) +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_QSYMM16_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + #else /* defined(ENABLE_QSYMM16_KERNELS) */ #define REGISTER_QSYMM16_NEON(func_name) nullptr #define REGISTER_QSYMM16_SVE(func_name) nullptr #endif /* defined(ENABLE_QSYMM16_KERNELS) */ #if defined(ENABLE_INTEGER_KERNELS) -#if defined(__ARM_FEATURE_SVE) + +#if defined(ENABLE_SVE) #define REGISTER_INTEGER_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_INTEGER_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + +#if defined(ENABLE_NEON) #define REGISTER_INTEGER_NEON(func_name) &(func_name) +#else /* !defined(ENABLE_NEON) */ +#define REGISTER_INTEGER_NEON(func_name) nullptr +#endif /* defined(ENABLE_NEON) */ + #else /* defined(ENABLE_INTEGER_KERNELS) */ #define REGISTER_INTEGER_NEON(func_name) nullptr #define REGISTER_INTEGER_SVE(func_name) nullptr diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp index eb38c18cff..8a57a3b529 100644 --- a/src/core/cpu/kernels/CpuActivationKernel.cpp +++ b/src/core/cpu/kernels/CpuActivationKernel.cpp @@ -60,7 +60,7 @@ struct ActivationKernel static const ActivationKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp16_sve_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, @@ -71,7 +71,8 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, 
REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) { "fp16_neon_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, @@ -82,9 +83,8 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) }, -#endif /* defined(__ARM_FEATURE_SVE) */ - -#if defined(__ARM_FEATURE_SVE2) /* defined(__ARM_FEATURE_SVE2) */ +#endif /* defined(ENABLE_NEON) */ +#if defined(__ARM_FEATURE_SVE2) { "qasymm8_sve_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, @@ -116,7 +116,7 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation) }, -#endif /* defined(__ARM_FEATURE_SVE2) */ +#endif /* defined(__ARM_FEATURE_SVE2) */ }; const ActivationKernel *get_implementation(const ActivationSelectorData &data) diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp index fc88a7e22d..7afdceae38 100644 --- a/src/core/cpu/kernels/CpuAddKernel.cpp +++ b/src/core/cpu/kernels/CpuAddKernel.cpp @@ -61,7 +61,7 @@ struct AddKernel static const AddKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "add_same_sve", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); }, @@ -102,7 +102,8 @@ static const AddKernel available_kernels[] = [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); }, REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) { "add_same_neon", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); }, @@ -145,8 +146,7 @@ static const AddKernel available_kernels[] = [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon) }, -#endif /* defined(__ARM_FEATURE_SVE) */ - +#endif /* defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) { "add_qasymm8_sve", @@ -179,7 +179,7 @@ static const AddKernel available_kernels[] = [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); }, REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) }, -#endif /* defined(__ARM_FEATURE_SVE2) */ +#endif /* defined(ENABLE_NEON) */ }; diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp index ddbc48feb8..643a870540 100644 --- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp @@ -76,28 +76,31 @@ configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorI ARM_COMPUTE_UNUSED(src1, dst); static ElementwiseKernel kernels[] = { -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op))), - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op))), + 
generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op))), + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) generate_kernel(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op>))), generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>))), -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) - generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op))), - generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op))), -#else /* defined(__ARM_FEATURE_SVE2) */ + generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op))), + generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op))), +#else /* !defined(__ARM_FEATURE_SVE2) */ generate_kernel(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized))), generate_kernel(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed))), #endif /* defined(__ARM_FEATURE_SVE2) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) generate_kernel(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op>))), -#endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>))), +#endif /* defined(ENABLE_NEON) */ }; for(const auto &uk : kernels) @@ -118,31 +121,31 @@ configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInf ARM_COMPUTE_UNUSED(src1, dst); static ElementwiseKernel kernels[] = { -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), - generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op))), + generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op))), + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op))), + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8))), generate_kernel(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32))), generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16))), generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32))), -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) - 
generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op))), - generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op))), -#else /* defined(__ARM_FEATURE_SVE2) */ + generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op))), + generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op))), +#else /* !defined(__ARM_FEATURE_SVE2) */ generate_kernel(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed))), generate_kernel(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized))), #endif /* defined(__ARM_FEATURE_SVE2) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) generate_kernel(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16))), -#endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ }; for(const auto &uk : kernels) diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 3a96d93c03..2600a49b70 100644 --- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -54,7 +54,7 @@ struct ElementwiseUnaryKernel static const ElementwiseUnaryKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp32_sve_elementwise_unary", [](DataType dt) { return dt == DataType::F32; }, @@ -70,7 +70,8 @@ static const ElementwiseUnaryKernel available_kernels[] = [](DataType dt) { return dt == DataType::S32; }, REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op), }, -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) +#if defined(ENABLE_NEON) { "fp32_neon_elementwise_unary", [](DataType dt) { return dt == DataType::F32; }, @@ -88,6 +89,7 @@ static const ElementwiseUnaryKernel available_kernels[] = [](DataType dt) { return dt == DataType::S32; }, REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op), }, +#endif // defined(ENABLE_NEON) }; const ElementwiseUnaryKernel *get_implementation(DataType dt) diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp index ed7517111f..29475fa63f 100644 --- a/src/core/cpu/kernels/CpuScaleKernel.cpp +++ b/src/core/cpu/kernels/CpuScaleKernel.cpp @@ -64,38 +64,39 @@ struct ScaleKernel static const ScaleKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp16_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_sve_scale) + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) }, { "f32_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_sve_scale) + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) }, { "qasymm8_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_sve_scale) + 
REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) }, { "qasymm8_signed_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_sve_scale) + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) }, { "u8_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_sve_scale) + REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) }, { "s16_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_sve_scale) + REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "common_neon_scale", @@ -128,7 +129,7 @@ static const ScaleKernel available_kernels[] = [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) }, -#endif /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ }; /** Micro-kernel selector diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp index d2453ed21d..8ea186b16a 100644 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp @@ -34,8 +34,8 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/softmax/impl/NEON/list.h" -#include "src/core/cpu/kernels/softmax/impl/SVE/list.h" +#include "src/core/cpu/kernels/softmax/impl/neon/list.h" +#include "src/core/cpu/kernels/softmax/impl/sve/list.h" namespace arm_compute { @@ -69,7 +69,7 @@ struct SoftmaxLogits1DMaxKernel static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "sve_softmax_logits_1d_float", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -80,7 +80,9 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ + +#if defined(ENABLE_NEON) { "neon_softmax_logits_1d_float", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -93,7 +95,7 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* !defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) { @@ -123,7 +125,7 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "sve_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -144,7 +146,8 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) { 
"neon_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -167,7 +170,7 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max) }, -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ }; const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data) diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp index bf31fd7d93..e4be1a4faa 100644 --- a/src/core/cpu/kernels/activation/sve/fp16.cpp +++ b/src/core/cpu/kernels/activation/sve/fp16.cpp @@ -28,7 +28,6 @@ #include #include -#if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" #include @@ -126,5 +125,4 @@ void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayer input, output); } } // namespace cpu -} // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp index 75f9f8a4c3..f797944435 100644 --- a/src/core/cpu/kernels/activation/sve/fp32.cpp +++ b/src/core/cpu/kernels/activation/sve/fp32.cpp @@ -29,7 +29,6 @@ #include #include -#if defined(__ARM_FEATURE_SVE) #include namespace arm_compute @@ -127,5 +126,4 @@ void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayer input, output); } } // namespace cpu -} // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp new file mode 100644 index 0000000000..d1660fe19e --- /dev/null +++ b/src/core/cpu/kernels/add/sve/impl.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/core/cpu/kernels/add/sve/impl.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue(); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + const bool is_sat = (policy == ConvertPolicy::SATURATE); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); + auto res = is_sat ? 
wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto val1 = svld1(pg, input1_ptr + x); + const auto val2 = svld1(pg, input2_ptr + x); + const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h new file mode 100644 index 0000000000..c38b1d47e0 --- /dev/null +++ b/src/core/cpu/kernels/add/sve/impl.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
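/*
 * The loop structure in add_same_sve above is the canonical SVE tail-free
 * pattern: svwhilelt builds a partial predicate for the final pass, so no
 * scalar remainder loop is needed, and svptest_any terminates once no lane is
 * active. Below is a standalone float-only sketch using raw ACLE intrinsics
 * (the kernel itself goes through the type-generic wrapper:: layer);
 * add_f32_sve is a hypothetical name and an SVE-enabled toolchain is assumed.
 */
#include <arm_sve.h>

void add_f32_sve(const float *a, const float *b, float *dst, int len)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len); // lanes still in range for this pass
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb));

        x += static_cast<int>(svcntw()); // advance by the hardware vector length
        pg = svwhilelt_b32(x, len);      // partial predicate covers the tail
    }
    while(svptest_any(svptrue_b32(), pg));
}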
+ */ +#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H +#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H + +#if defined(ENABLE_SVE) +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif // defined(ENABLE_SVE) +#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp index ae74bfa3eb..6dec140499 100644 --- a/src/core/cpu/kernels/add/sve/integer.cpp +++ b/src/core/cpu/kernels/add/sve/integer.cpp @@ -25,9 +25,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include namespace arm_compute @@ -197,5 +196,4 @@ void add_u8_s16_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, add_s16_u8_s16_sve(src1, src0, dst, policy, window); } } // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h index 71dd875ad8..aebb43bb60 100644 --- a/src/core/cpu/kernels/add/sve/list.h +++ b/src/core/cpu/kernels/add/sve/list.h @@ -24,11 +24,12 @@ #ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H #define SRC_CORE_SVE_KERNELS_ADD_LIST_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/cpu/kernels/add/sve/impl.h" #include namespace arm_compute @@ -47,99 +48,7 @@ DECLARE_ADD_KERNEL(add_u8_u8_s16_sve); #undef DECLARE_ADD_KERNEL -template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - const bool is_sat = (policy == ConvertPolicy::SATURATE); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); - Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); - Iterator output(dst, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? 
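/*
 * The new impl.h/impl.cpp pair above moves the add_same_sve template out of
 * the header: list.h and impl.h only declare it, while impl.cpp defines it and
 * provides explicit instantiations. That way only the SVE translation units
 * need SVE compile flags, and generic code in the fat binary can include the
 * headers without them. A toy version of the same split, with hypothetical
 * names, looks like this:
 */

// scale_impl.h -- safe to include from code built without SVE/Neon(TM) flags.
template <typename T>
void scale_by_two(const T *src, T *dst, int len);

// scale_impl.cpp -- the only translation unit that needs the special flags.
template <typename T>
void scale_by_two(const T *src, T *dst, int len)
{
    for(int i = 0; i < len; ++i)
    {
        dst[i] = src[i] * static_cast<T>(2);
    }
}

// Explicit instantiations give callers concrete symbols to link against.
template void scale_by_two<float>(const float *, float *, int);
template void scale_by_two<int>(const int *, int *, int);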
src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto val1 = svld1(pg, input1_ptr + x); - const auto val2 = svld1(pg, input2_ptr + x); - const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} } // namespace cpu } // namespace arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp new file mode 100644 index 0000000000..2c3bb0ff7c --- /dev/null +++ b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template +struct LoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + const InputScalarType *input2_ptr; + OutputScalarType *output_ptr; +}; + +template +struct BroadcastLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + InputScalarType broadcast_value; + OutputScalarType *output_ptr; + bool reorder; +}; + +template +void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template +void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template +void comparison_op_loop(svbool_t pg, const LoopArguments &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, args.output_ptr, res); +} + +template +void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; + const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, args.output_ptr, res); +} + +template +using LoopFuncType = void (*)(svbool_t, const LoopArguments &); + +template +using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); + +template ::type, + typename OutputScalarType = typename sve_scalar::type> +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OperatorType op, + LoopFuncType func, + BroadcastLoopFuncType broadcast_func) +{ + const auto all_true_pg = svptrue(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do + { + broadcast_func(pg, + { + op, + non_broadcast_input_ptr + x, + broadcast_value, + output_ptr + x, + !is_broadcast_input_2 + }); + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do + { + func(pg, + { + op, + input1_ptr + x, + input2_ptr + x, + output_ptr + x + }); + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
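/*
 * elementwise_op above is a single driver specialised only through two
 * function pointers: a full-vector loop and a broadcast loop. Broadcast along
 * x is detected by a zero step on the broadcast input's window, and the
 * reorder flag tells the inner loop which operand was the scalar. A reduced
 * sketch of that plug-in shape, with hypothetical names and plain float
 * buffers instead of tensors and windows, is given below.
 */
#include <cstddef>

struct Args
{
    const float *in1;
    const float *in2;
    float       *out;
    size_t       n;
};

using LoopFn          = void (*)(const Args &);
using BroadcastLoopFn = void (*)(const Args &, float broadcast_value, bool reorder);

void run_elementwise(const Args &args, bool in2_is_scalar, float scalar,
                     LoopFn loop, BroadcastLoopFn broadcast_loop)
{
    // Stand-in for the "input2_win.x().step() == 0" check in the real kernel.
    if(in2_is_scalar)
    {
        // reorder == false: the non-broadcast input stays the left operand.
        broadcast_loop(args, scalar, false);
    }
    else
    {
        loop(args);
    }
}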
Window &window) +{ + using VectorType = typename sve_vector::type; + + elementwise_op(in1, in2, out, window, op, + &arithmetic_op_loop, + &arithmetic_op_broadcast_loop); +} + +template +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + using InputVectorType = typename sve_vector::type; + using OutputVectorType = typename sve_vector::type; + + elementwise_op(in1, in2, out, window, op, + &comparison_op_loop, + &comparison_op_broadcast_loop); +} + +template <> +svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint16_t elementwise_div(svbool_t &pg, const svint16_t &a, const svint16_t &b) +{ + ARM_COMPUTE_UNUSED(pg, a, b); + ARM_COMPUTE_ERROR("Not supported"); +} + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
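/*
 * The long instantiation lists above and below exist because the arithmetic or
 * comparison operation is a template parameter: every (operation, element
 * type) combination a caller can request at run time needs a concrete symbol
 * in this translation unit now that the definitions no longer live in the
 * header. The toy example below shows the same idea with illustrative names.
 */
#include <cstddef>

enum class Op { ADD, SUB };

template <Op op, typename T>
void apply(const T *a, const T *b, T *dst, size_t n)
{
    for(size_t i = 0; i < n; ++i)
    {
        dst[i] = (op == Op::ADD) ? static_cast<T>(a[i] + b[i]) : static_cast<T>(a[i] - b[i]);
    }
}

// One explicit instantiation per selectable combination.
template void apply<Op::ADD, float>(const float *, const float *, float *, size_t);
template void apply<Op::SUB, float>(const float *, const float *, float *, size_t);
template void apply<Op::ADD, int>(const int *, const int *, int *, size_t);
template void apply<Op::SUB, int>(const int *, const int *, int *, size_t);

// A run-time dispatcher can then hand out pointers to those instantiations.
using FloatFn = void (*)(const float *, const float *, float *, size_t);
inline FloatFn resolve(Op op)
{
    return (op == Op::ADD) ? &apply<Op::ADD, float> : &apply<Op::SUB, float>;
}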
Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h index 83c3355de4..a92a8648a8 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h @@ -23,50 +23,62 @@ */ #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/svtraits.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" #include namespace arm_compute { namespace cpu { -namespace sve -{ using namespace arm_compute::wrapper; template -inline VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) +VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) { return svpow_z(pg, a, b); } -template <> -inline svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - template -inline VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) +VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) { return svdiv_z(pg, a, b); } -template <> -inline svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) +template +svbool_t narrow_to_byte_predicate(svbool_t pg) { - return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); + const auto all_false = svpfalse(); + + switch(bytewidth) + { + case 8: + pg = svuzp1_b32(pg, all_false); + /* fall through */ + case 4: + pg = svuzp1_b16(pg, all_false); + /* fall through */ + case 2: + pg = svuzp1_b8(pg, all_false); + /* fall through */ + default: + break; + } + return pg; } template -inline VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) +VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) { - using ScalarType = typename sve_scalar::type; + using ScalarType = typename wrapper::sve_scalar::type; VectorType res{}; switch(op) @@ -108,30 +120,8 @@ inline VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, c return res; } -template -inline svbool_t narrow_to_byte_predicate(svbool_t pg) -{ - const auto all_false = svpfalse(); - - switch(bytewidth) - { - case 8: - pg = svuzp1_b32(pg, all_false); - /* fall through */ - case 4: - pg = svuzp1_b16(pg, all_false); - /* fall through */ - case 2: - pg = svuzp1_b8(pg, all_false); - /* fall through */ - default: - break; - } - return pg; 
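/*
 * narrow_to_byte_predicate above compacts a governing predicate whose
 * granularity matches the input element width (8, 4 or 2 bytes) down to byte
 * granularity by repeatedly interleaving it with an all-false predicate via
 * svuzp1; the fall-through switch performs three, two or one steps
 * respectively. That is what lets a comparison on 32-bit lanes drive a u8
 * store. A float greater-than sketch with raw intrinsics follows
 * (hypothetical function name, SVE-enabled toolchain assumed).
 */
#include <arm_sve.h>
#include <cstdint>

void greater_f32_to_u8(const float *a, const float *b, uint8_t *dst, int len)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len);
    do
    {
        const svbool_t cmp = svcmpgt_f32(pg, svld1_f32(pg, a + x), svld1_f32(pg, b + x));

        // Two svuzp1 steps for 4-byte inputs, mirroring narrow_to_byte_predicate<4>.
        svbool_t cmp_b = svuzp1_b16(cmp, svpfalse());
        cmp_b          = svuzp1_b8(cmp_b, svpfalse());
        svbool_t pg_b  = svuzp1_b16(pg, svpfalse());
        pg_b           = svuzp1_b8(pg_b, svpfalse());

        // 0xFF where the comparison held, 0 elsewhere, one byte per input lane.
        const svuint8_t res = svsel_u8(cmp_b, svdup_n_u8(0xFF), svdup_n_u8(0));
        svst1_u8(pg_b, dst + x, res);

        x += static_cast<int>(svcntw());
        pg = svwhilelt_b32(x, len);
    }
    while(svptest_any(svptrue_b32(), pg));
}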
-} - template -inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) { svbool_t selection_vector{}; @@ -159,10 +149,10 @@ inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVecto ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); } - using InputScalarType = typename sve_scalar::type; + using InputScalarType = typename wrapper::sve_scalar::type; selection_vector = narrow_to_byte_predicate(selection_vector); - using OutputScalarType = typename sve_scalar::type; + using OutputScalarType = typename wrapper::sve_scalar::type; const auto false_vector = svdup_n(static_cast((uint32_t)0)); const auto true_vector = svdup_n(static_cast(~(uint32_t)0)); auto ret = svsel(selection_vector, true_vector, false_vector); @@ -170,197 +160,12 @@ inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVecto return ret; } -template -struct LoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; -}; - -template -struct BroadcastLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - InputScalarType broadcast_value; - OutputScalarType *output_ptr; - bool reorder; -}; - -template -inline void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -inline void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -inline void comparison_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -inline void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -using LoopFuncType = void (*)(svbool_t, const LoopArguments &); - -template -using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); - -template ::type, - typename OutputScalarType = typename sve_scalar::type> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopFuncType func, - BroadcastLoopFuncType broadcast_func) -{ - const auto all_true_pg = svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - broadcast_func(pg, - { - op, - non_broadcast_input_ptr + x, - broadcast_value, - output_ptr + x, - !is_broadcast_input_2 - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - func(pg, - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window) -{ - using VectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &arithmetic_op_loop, - &arithmetic_op_broadcast_loop); -} - -template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector::type; - using OutputVectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &comparison_op_loop, - &comparison_op_broadcast_loop); -} +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace sve +template +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); } // namespace cpu } // namespace arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h index b6342c727c..6c5524e284 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h @@ -26,14 +26,13 @@ #if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/wrapper/svtraits.h" #include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" namespace arm_compute { namespace cpu { -namespace sve -{ using namespace arm_compute::wrapper; template @@ -176,7 +175,7 @@ inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArgumen const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - using OutputVectorType = typename sve_vector::type; + using OutputVectorType = typename wrapper::traits::sve_vector::type; const auto result = svcreate4( elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), @@ -200,7 +199,7 @@ inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQ const auto &af = args.reorder ? in2 : in1; const auto &bf = args.reorder ? 
in1 : in2; - using OutputVectorType = typename sve_vector::type; + using OutputVectorType = typename wrapper::traits::sve_vector::type; const auto result = svcreate4( elementwise_comparison_op(pg, svget4(af, 0), svget4(bf, 0), args.op), @@ -221,8 +220,8 @@ template &); template ::type, - typename OutputScalarType = typename sve_scalar::type> + typename InputScalarType = typename wrapper::sve_scalar::type, + typename OutputScalarType = typename wrapper::sve_scalar::type> void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OperatorType op, LoopQuantizedFuncType func, @@ -344,7 +343,7 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o template void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - using VectorType = typename sve_vector::type; + using VectorType = typename wrapper::traits::sve_vector::type; elementwise_quantized_op(in1, in2, out, window, op, &arithmetic_op_quantized_loop, &arithmetic_op_broadcast_quantized_loop); @@ -354,14 +353,12 @@ template = sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector::type; - using OutputVectorType = typename sve_vector::type; + using InputVectorType = typename wrapper::traits::sve_vector::type; + using OutputVectorType = typename wrapper::traits::sve_vector::type; elementwise_quantized_op(in1, in2, out, window, op, &comparison_op_quantized_loop, &comparison_op_broadcast_quantized_loop); } - -} // namespace sve } // namespace cpu } // namespace arm_compute diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp new file mode 100644 index 0000000000..cb58548f0b --- /dev/null +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return svinvsqrt(pg, a); + case ElementWiseUnary::EXP: + return wrapper::svexp_z(pg, a); + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::LOG: + return wrapper::svlog_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + case ElementWiseUnary::ROUND: + return svrintn_z(pg, a); + case ElementWiseUnary::SIN: + return wrapper::svsin_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template +inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const auto all_true_pg = wrapper::svptrue(); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input, output); +} + +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h index 23502c71e5..63490421e9 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h @@ -25,87 +25,15 @@ #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#if defined(__ARM_FEATURE_SVE) -#include "src/core/NEON/SVEMath.h" -#include +#if defined(ENABLE_SVE) namespace arm_compute { namespace cpu { -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return 
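/*
 * elementwise_op_sve_imp above is split with std::enable_if: the
 * floating-point overload supports RSQRT/EXP/NEG/LOG/ABS/ROUND/SIN, while the
 * integer overload only supports NEG and ABS, and elementwise_sve_op drives
 * either through the same predicated loop. The scalar toy below (illustrative
 * names, no SVE) shows just the SFINAE split.
 */
#include <cmath>
#include <stdexcept>
#include <type_traits>

enum class UnaryOp { RSQRT, EXP, NEG, ABS };

template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
apply_unary(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::RSQRT: return static_cast<T>(1) / std::sqrt(a);
        case UnaryOp::EXP:   return std::exp(a);
        case UnaryOp::NEG:   return -a;
        case UnaryOp::ABS:   return std::abs(a);
    }
    throw std::invalid_argument("unsupported op");
}

template <typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type
apply_unary(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::NEG: return static_cast<T>(-a);
        case UnaryOp::ABS: return a < 0 ? static_cast<T>(-a) : a;
        default: throw std::invalid_argument("unsupported op for integer types");
    }
}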
svinvsqrt(pg, a); - case ElementWiseUnary::EXP: - return wrapper::svexp_z(pg, a); - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::LOG: - return wrapper::svlog_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - case ElementWiseUnary::ROUND: - return svrintn_z(pg, a); - case ElementWiseUnary::SIN: - return wrapper::svsin_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); -} - +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); } // namespace cpu } // namespace arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/floor/NEON/fp16.cpp b/src/core/cpu/kernels/floor/NEON/fp16.cpp deleted file mode 100644 index f362676a36..0000000000 --- a/src/core/cpu/kernels/floor/NEON/fp16.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 8; - -void fp16_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast<__fp16 *>(dst); - - for(; len >= step; len -= step) - { - vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++psrc; - ++pdst; - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/floor/NEON/fp32.cpp b/src/core/cpu/kernels/floor/NEON/fp32.cpp deleted file mode 100644 index f5efb2e849..0000000000 --- a/src/core/cpu/kernels/floor/NEON/fp32.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 4; - -void fp32_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast(dst); - - for(; len >= step; len -= step) - { - vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++pdst; - ++psrc; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/floor/neon/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp new file mode 100644 index 0000000000..f362676a36 --- /dev/null +++ b/src/core/cpu/kernels/floor/neon/fp16.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 8; + +void fp16_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast(src); + auto pdst = static_cast<__fp16 *>(dst); + + for(; len >= step; len -= step) + { + vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); + psrc += step; + pdst += step; + } + + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++psrc; + ++pdst; + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/floor/neon/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp new file mode 100644 index 0000000000..f5efb2e849 --- /dev/null +++ b/src/core/cpu/kernels/floor/neon/fp32.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 4; + +void fp32_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast(src); + auto pdst = static_cast(dst); + + for(; len >= step; len -= step) + { + vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); + psrc += step; + pdst += step; + } + + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++pdst; + ++psrc; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp index 99f08dbdf9..5b9377c6e6 100644 --- a/src/core/cpu/kernels/scale/sve/fp16.cpp +++ b/src/core/cpu/kernels/scale/sve/fp16.cpp @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -30,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -173,4 +173,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp index 94055ae953..05fbedf20d 100644 --- a/src/core/cpu/kernels/scale/sve/fp32.cpp +++ b/src/core/cpu/kernels/scale/sve/fp32.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -33,7 +34,6 @@ #include #include -#if defined(__ARM_FEATURE_SVE) #include namespace arm_compute @@ -171,4 +171,4 @@ void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp index 2a724ece31..d7e270c661 100644 --- a/src/core/cpu/kernels/scale/sve/integer.cpp +++ b/src/core/cpu/kernels/scale/sve/integer.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
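/*
 * The floor kernels re-added above (now under the lower-case neon/ directory)
 * keep their original signatures, so they can be exercised directly with plain
 * buffers; the expected results follow std::floor. Below is an assumed
 * standalone driver -- inside the library the entry point is reached through
 * the registered function pointer instead.
 */
#include <vector>

namespace arm_compute { namespace cpu { void fp32_neon_floor(const void *src, void *dst, int len); } }

int main()
{
    std::vector<float> src = { 1.7f, -2.3f, 0.5f, -0.5f };
    std::vector<float> dst(src.size());

    arm_compute::cpu::fp32_neon_floor(src.data(), dst.data(), static_cast<int>(src.size()));
    // dst now holds { 1.0f, -3.0f, 0.0f, -1.0f }
    return 0;
}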
*/ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -30,12 +31,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -298,4 +297,4 @@ void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp index c041f14b22..f747037938 100644 --- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp +++ b/src/core/cpu/kernels/scale/sve/qasymm8.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -31,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -90,8 +89,8 @@ void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor bool align_corners, const Window &window) { // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; + const int idx_width = 1; + const int idx_height = 2; // Compute the ratio between source height and destination height const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); @@ -205,4 +204,4 @@ void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // defined(ENABLE_SVE) \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp index 9df4301fe3..584ec7a0da 100644 --- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -31,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -90,8 +89,8 @@ void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const I bool align_corners, const Window &window) { // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; + const int idx_width = 1; + const int idx_height = 2; // Compute the ratio between source height and destination height const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); @@ -205,4 +204,4 @@ void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *o } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/softmax/impl/NEON/list.h b/src/core/cpu/kernels/softmax/impl/NEON/list.h deleted file mode 100644 index 5ebee31272..0000000000 --- a/src/core/cpu/kernels/softmax/impl/NEON/list.h +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H - -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "support/SaturateCast.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } - - *out_ptr = max_val; - }, - input, output); -} - -template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - static_assert(std::is_same::value - || std::is_same::value, - "quantized type should be either qasymm8_t or qasymm8_signed_t."); - - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - constexpr int vec_size = 16; - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); - - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); - - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], 
vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else - { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); - } - - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } - - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) - { - element = (max_val - in_ptr[x]) * scale_beta; - sum += std::exp(element); - } - else - { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; - } - - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else - { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) - { - const float32x4x4_t sub = - { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int(sub); - } - else - { - float32x4x4_t mul = - { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) - { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); - } - - normalized_value = convert_float_to_int(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 
128.f : 0)); - } - } - } - }, - in_it, max_it, out_it); -} - -template -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } - - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) - { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; - - if(is_log) - { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); - } - else - { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; - } - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast(std::log(sum)); - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if(is_log) - { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); - } - else - { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = tmp_ptr[x] - sum; - } - else - { - out_ptr[x] = tmp_ptr[x] * sum_inversed; - } - } - } - }, - in_it, max_it, out_it); -} - -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/SVE/list.h b/src/core/cpu/kernels/softmax/impl/SVE/list.h deleted file mode 100644 index d558d7d193..0000000000 --- 
a/src/core/cpu/kernels/softmax/impl/SVE/list.h +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H - -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - - auto max_val = svmaxv(all_true_pg, vec_max); - - *out_ptr = max_val; - }, - input, output); -} - -#if defined(__ARM_FEATURE_SVE2) -template -void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = svdup_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - const auto all_true_pg = wrapper::svptrue(); - using SVEType = typename wrapper::traits::sve_vector::type; - - const int inc_1 = static_cast(svcntw()); - const int inc_2 = static_cast(2 * svcntw()); - const int inc_3 = static_cast(3 * svcntw()); - - execute_window_loop(window, [&](const Coordinates &) - 
{ - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_max, vec_elements); - - auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements))); - - if(is_log) - { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); - } - else - { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); - } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else - { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute 
softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) - { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - - if(is_qasymm8_signed) - { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, vec_in_0, offset_vec); - res_1 = svsub_z(pg_1, vec_in_1, offset_vec); - res_2 = svsub_z(pg_2, vec_in_2, offset_vec); - res_3 = svsub_z(pg_3, vec_in_3, offset_vec); - } - } - - // Store value - const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); - svst1(pg, out_ptr + x, out); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} -#endif /* defined(__ARM_FEATURE_SVE2) */ - -template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - const auto all_true_pg = wrapper::svptrue(); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum = wrapper::svdup_n(static_cast(0)); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_elements, vec_max); - if(is_log) - { - vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta))); - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); - } - else - { - vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta)))); - vec_sum = svadd_m(pg, vec_sum, vec_elements); - } - svst1(pg, tmp_ptr + x, vec_elements); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = 
static_cast(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - else - { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} - -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ - -#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/neon/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h new file mode 100644 index 0000000000..5ebee31272 --- /dev/null +++ b/src/core/cpu/kernels/softmax/impl/neon/list.h @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H +#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H + +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "support/SaturateCast.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + const int sum_stages = log2(window_step_x / 2); + execute_window_loop(win, [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = window_start_x; + + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for(int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; + } + + *out_ptr = max_val; + }, + input, output); +} + +template +void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, float beta, bool is_log, const Window &window) +{ + static_assert(std::is_same::value + || std::is_same::value, + "quantized type should be either qasymm8_t or qasymm8_signed_t."); + + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const auto scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + constexpr int vec_size = 16; + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + float sum{}; + float sum_inversed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = + { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float(vec_elements); + + if(is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], 
vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); + } + + /* Reduce sum */ + const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); + + /* Run remaining elements */ + for(; x < input_width; ++x) + { + float element{}; + if(is_log) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } + + tmp_ptr[x] = element; + } + + if(!is_log) + { + sum_inversed = 256.f / sum; + } + else + { + sum = std::log(sum); + } + } + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + using int_vec_type = wrapper::traits::neon_vector_t; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if(is_log) + { + const float32x4x4_t sub = + { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int(sub); + } + else + { + float32x4x4_t mul = + { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if(is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for(; x < input_width; ++x) + { + if(is_log) + { + out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); + } + else + { + out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 
128.f : 0)); + } + } + } + }, + in_it, max_it, out_it); +} + +template +void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + constexpr int vec_size = 16 / sizeof(T); + const int sum_stages = log2(vec_size / 2); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + T sum{}; + T sum_inversed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); + + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if(is_log) + { + vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); + } + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for(int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + sum = wrapper::vgetlane(sum_res, 0); + + /* Run remaining elements */ + for(; x < input_width; ++x) + { + T element{}; + + if(is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } + + if(!is_log) + { + sum_inversed = T(1) / sum; + } + else + { + sum = static_cast(std::log(sum)); + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); + if(is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); + } + else + { + normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for(; x < input_width; ++x) + { + if(is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } + } + } + }, + in_it, max_it, out_it); +} + +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp new file mode 100644 index 0000000000..4ed5a4fbea --- /dev/null 
+++ b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue(); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + + auto max_val = svmaxv(all_true_pg, vec_max); + + *out_ptr = max_val; + }, + input, output); +} + +template +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + const auto all_true_pg = wrapper::svptrue(); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + ScalarType sum{ 0 }; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const 
auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svsub_z(pg, vec_elements, vec_max); + if(is_log) + { + vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta))); + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + else + { + vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta)))); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if(is_log) + { + sum = static_cast(std::log(sum)); + } + else + { + sum = ScalarType(1) / sum; + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do + { + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast(0)); + if(is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} + +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); + +template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ENABLE_SVE) */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h new file mode 100644 index 0000000000..7ddb358b8e --- /dev/null +++ b/src/core/cpu/kernels/softmax/impl/sve/list.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H +#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H + +#if defined(ENABLE_SVE) +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); + +template +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); + +#if defined(__ARM_FEATURE_SVE2) +template +void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const auto scale_beta_vec = svdup_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + const auto all_true_pg = wrapper::svptrue(); + using SVEType = typename wrapper::traits::sve_vector::type; + + const int inc_1 = static_cast(svcntw()); + const int inc_2 = static_cast(2 * svcntw()); + const int inc_3 = static_cast(3 * svcntw()); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + float sum{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svsub_z(pg, vec_max, vec_elements); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements))); + + if(is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = 
svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } + + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } + while(svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if(is_log) + { + sum = std::log(sum); + } + else + { + sum = 256.f / sum; + } + } + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; + + if(is_log) + { + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + + if(is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, vec_in_0, offset_vec); + res_1 = svsub_z(pg_1, vec_in_1, offset_vec); + res_2 = svsub_z(pg_2, vec_in_2, offset_vec); + res_3 = svsub_z(pg_3, vec_in_3, offset_vec); + } + } + + // Store value + const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = 
svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } + while(svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} +#endif /* defined(__ARM_FEATURE_SVE2) */ +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ENABLE_SVE) */ + +#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */ diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index 577603d07d..111e969bae 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -68,11 +68,11 @@ RelativeTolerance relative_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return RelativeTolerance(0.25f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return RelativeTolerance(0.1f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return RelativeTolerance(0.05f); } @@ -80,11 +80,11 @@ RelativeTolerance relative_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return RelativeTolerance(0.9f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return RelativeTolerance(0.01f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return RelativeTolerance(0.00001f); } @@ -111,11 +111,11 @@ AbsoluteTolerance absolute_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return AbsoluteTolerance(0.25f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return AbsoluteTolerance(0.01f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return AbsoluteTolerance(0.00001f); } @@ -123,11 +123,11 @@ AbsoluteTolerance absolute_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return AbsoluteTolerance(0.9f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return AbsoluteTolerance(0.01f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return AbsoluteTolerance(0.00001f); } diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp index 98341805ed..ea6656eefe 100644 --- a/tests/validation/NEON/ArithmeticAddition.cpp +++ b/tests/validation/NEON/ArithmeticAddition.cpp @@ -43,11 +43,11 @@ namespace validation { namespace { -#if !defined(__aarch64__) || defined(__ARM_FEATURE_SVE) +#if !defined(__aarch64__) || defined(ENABLE_SVE) constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#else // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE) +#else // !defined(__aarch64__) || defined(ENABLE_SVE) constexpr AbsoluteTolerance tolerance_quant(0); -#endif // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE) +#endif // !defined(__aarch64__) || defined(ENABLE_SVE) /** Input data sets **/ const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType", -- cgit v1.2.1
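
The relocated floor/neon kernels in this patch all follow the usual Neon shape: a full-register loop over `step` elements, then a scalar tail with std::floor for whatever is left. Below is a minimal self-contained sketch of that structure; it is not code from the patch. It uses the plain ACLE intrinsic vrndmq_f32 (round towards minus infinity, available on AArch64) rather than the library's vfloorq_f32 helper, and the function name is illustrative only.

#include <arm_neon.h>
#include <cmath>

// Same main-loop-plus-tail structure as fp32_neon_floor in the patch.
void floor_f32(const float *src, float *dst, int len)
{
    constexpr int step = 4; // a 128-bit Neon register holds four floats
    for(; len >= step; len -= step)
    {
        vst1q_f32(dst, vrndmq_f32(vld1q_f32(src))); // floor = round towards -inf
        src += step;
        dst += step;
    }
    for(; len > 0; --len)
    {
        *dst++ = std::floor(*src++); // scalar tail for the leftover elements
    }
}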
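The SVE scale and softmax kernels above never need such a scalar tail: they iterate with a predicate produced by svwhilelt, advance by the hardware vector length (svcntw), and stop when svptest_any reports that no lane is active. A minimal sketch of that loop shape using plain ACLE intrinsics, with an illustrative function that only scales a buffer; it assumes the same ENABLE_SVE guard the patch introduces.

#if defined(ENABLE_SVE)
#include <arm_sve.h>

// Multiply every element by `scale`; the predicate handles the tail,
// exactly as the SVE kernels in this patch do.
void scale_f32_sve(const float *src, float *dst, int len, float scale)
{
    const svfloat32_t vscale = svdup_n_f32(scale);
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len);
    do
    {
        const svfloat32_t v = svld1_f32(pg, src + x);
        svst1_f32(pg, dst + x, svmul_f32_z(pg, v, vscale));
        x += static_cast<int>(svcntw());     // advance by the runtime vector length
        pg = svwhilelt_b32(x, len);
    } while(svptest_any(svptrue_b32(), pg)); // loop while any lane is still active
}
#endif // ENABLE_SVE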
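Both the Neon and SVE softmax kernels moved by this patch implement the same numerically stable recipe per row: subtract the row maximum, scale by beta, exponentiate, accumulate the sum, then divide by the sum, or subtract log(sum) when is_log is set. A plain scalar reference of that recipe can make the vectorized versions easier to read; this is a sketch, not library code.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one row; the kernels above vectorize exactly this.
std::vector<float> softmax_row(const std::vector<float> &in, float beta, bool is_log)
{
    float max_val = in[0];
    for(float v : in) { max_val = std::max(max_val, v); }

    std::vector<float> out(in.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        const float shifted = (in[i] - max_val) * beta;
        out[i] = is_log ? shifted : std::exp(shifted); // kernels keep this in the tmp buffer
        sum += std::exp(shifted);
    }

    for(float &v : out)
    {
        v = is_log ? v - std::log(sum) : v / sum;      // normalize (or subtract log-sum)
    }
    return out;
}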
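The new softmax/impl/sve/impl.cpp also shows the structural pattern used to make the SVE code packageable: sve/list.h now only declares the templates, while impl.cpp defines them and explicitly instantiates the supported data types, so the header can be included from translation units that are not built with SVE flags. Stripped to its essentials (the names and types below are placeholders, not the library's):

#include <cstdint>

// header: declaration only
template <typename T>
void logits_1d_max(const T *in, T *out, int len);

// source file compiled with the architecture-specific flags: the definition...
template <typename T>
void logits_1d_max(const T *in, T *out, int len)
{
    T m = in[0]; // assumes len > 0
    for(int i = 1; i < len; ++i)
    {
        m = in[i] > m ? in[i] : m;
    }
    *out = m;
}

// ...plus explicit instantiations for every type the runtime may dispatch to.
template void logits_1d_max<float>(const float *, float *, int);
template void logits_1d_max<std::uint8_t>(const std::uint8_t *, std::uint8_t *, int);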
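Finally, the reason every __ARM_FEATURE_SVE guard above becomes ENABLE_SVE: the SVE files can now be compiled, with their own per-file flags, into the same binary as the Neon files, and an implementation is picked at run time. The actual selection goes through Registrars.h and the kernels' dispatch tables, which are outside this excerpt; the sketch below only illustrates the idea, and cpu_has_sve() is a stand-in for a real CPU-feature query (for example HWCAP on Linux).

#include <cmath>

// Stand-ins for the real kernels; the actual ones live in the per-arch .cpp files.
static void floor_f32_neon(const float *src, float *dst, int len)
{
    for(int i = 0; i < len; ++i) { dst[i] = std::floor(src[i]); }
}
#if defined(ENABLE_SVE)
static void floor_f32_sve(const float *src, float *dst, int len)
{
    for(int i = 0; i < len; ++i) { dst[i] = std::floor(src[i]); }
}
#endif

// Assumed stand-in for a runtime CPU-feature query.
static bool cpu_has_sve() { return false; }

using FloorFn = void (*)(const float *, float *, int);

// Pick the best implementation available in this binary and on this CPU.
static FloorFn select_floor_kernel()
{
#if defined(ENABLE_SVE)
    if(cpu_has_sve())
    {
        return &floor_f32_sve;
    }
#endif
    return &floor_f32_neon;
}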