From b6af482bc5d8e4f03f876e17909c561de198c4d3 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 14 Sep 2021 12:33:34 +0100 Subject: Per-operator build dependencies Creates a list of operators and their respective dependencies. Alters the build system to walk through them, resolve the dependencies and build Compute Library. Removes the following unused kernels/functions: -[NE|CL]MinMaxLayerKernel -CLFillBorder Resolves: COMPMID-4695,COMPMID-4696 Signed-off-by: Georgios Pinitas Change-Id: I35ebeef38dac25ec5459cfe9c5f7c9a708621124 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/357914 Tested-by: bsgcomp Reviewed-by: Michele DiGiorgio Comments-Addressed: bsgcomp Signed-off-by: Freddie Liardet Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6295 Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- Android.bp | 3 - SConscript | 305 +-- SConstruct | 12 +- arm_compute/runtime/CL/CLFunctions.h | 1 - arm_compute/runtime/CL/functions/CLFillBorder.h | 67 - docs/user_guide/library.dox | 29 + docs/user_guide/operator_list.dox | 17 +- docs/user_guide/release_version_and_change_log.dox | 4 +- filelist.json | 2569 +++++++++++--------- src/core/CL/CLKernels.h | 1 - src/core/CL/kernels/CLMinMaxLayerKernel.cpp | 169 -- src/core/CL/kernels/CLMinMaxLayerKernel.h | 87 - src/core/NEON/NEKernels.h | 1 - src/core/NEON/kernels/NEMinMaxLayerKernel.cpp | 224 -- src/core/NEON/kernels/NEMinMaxLayerKernel.h | 90 - .../kernels/arm_conv/depthwise/depthwise_s8q.cpp | 2 - src/runtime/CL/functions/CLFillBorder.cpp | 45 - tests/framework/instruments/OpenCLTimer.cpp | 45 +- tests/framework/instruments/OpenCLTimer.h | 10 +- tests/framework/instruments/SchedulerTimer.cpp | 34 +- tests/framework/instruments/SchedulerTimer.h | 16 +- 21 files changed, 1645 insertions(+), 2086 deletions(-) delete mode 100644 arm_compute/runtime/CL/functions/CLFillBorder.h delete mode 100644 src/core/CL/kernels/CLMinMaxLayerKernel.cpp delete
mode 100644 src/core/CL/kernels/CLMinMaxLayerKernel.h delete mode 100644 src/core/NEON/kernels/NEMinMaxLayerKernel.cpp delete mode 100644 src/core/NEON/kernels/NEMinMaxLayerKernel.h delete mode 100644 src/runtime/CL/functions/CLFillBorder.cpp diff --git a/Android.bp b/Android.bp index 9b6808eb9a..8b73de5f2f 100644 --- a/Android.bp +++ b/Android.bp @@ -226,7 +226,6 @@ cc_library_static { "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp", "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp", "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp", - "src/core/CL/kernels/CLMinMaxLayerKernel.cpp", "src/core/CL/kernels/CLNormalizationLayerKernel.cpp", "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp", "src/core/CL/kernels/CLPadLayerKernel.cpp", @@ -280,7 +279,6 @@ cc_library_static { "src/core/NEON/kernels/NELogicalKernel.cpp", "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp", "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp", - "src/core/NEON/kernels/NEMinMaxLayerKernel.cpp", "src/core/NEON/kernels/NENormalizationLayerKernel.cpp", "src/core/NEON/kernels/NEPadLayerKernel.cpp", "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp", @@ -639,7 +637,6 @@ cc_library_static { "src/runtime/CL/functions/CLFFT2D.cpp", "src/runtime/CL/functions/CLFFTConvolutionLayer.cpp", "src/runtime/CL/functions/CLFill.cpp", - "src/runtime/CL/functions/CLFillBorder.cpp", "src/runtime/CL/functions/CLFlattenLayer.cpp", "src/runtime/CL/functions/CLFloor.cpp", "src/runtime/CL/functions/CLFullyConnectedLayer.cpp", diff --git a/SConscript b/SConscript index df8f33a917..c88a86773c 100644 --- a/SConscript +++ b/SConscript @@ -38,27 +38,27 @@ Import('vars') Import('install_lib') def build_bootcode_objs(sources): - arm_compute_env.Append(ASFLAGS = "-I bootcode/") obj = arm_compute_env.Object(sources) obj = install_lib(obj) Default(obj) return obj -def build_sve_objs(sources): +def build_sve_objs(sources): tmp_env = arm_compute_env.Clone() tmp_env.Append(CXXFLAGS = 
"-march=armv8.2-a+sve+fp16") obj = tmp_env.SharedObject(sources) Default(obj) return obj -def build_objs(sources): +def build_objs(sources): obj = arm_compute_env.SharedObject(sources) Default(obj) return obj + def build_library(name, build_env, sources, static=False, libs=[]): if static: obj = build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs) @@ -72,6 +72,7 @@ def build_library(name, build_env, sources, static=False, libs=[]): Default(obj) return obj + def remove_incode_comments(code): def replace_with_empty(match): s = match.group(0) @@ -83,6 +84,7 @@ def remove_incode_comments(code): comment_regex = re.compile(r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE) return re.sub(comment_regex, replace_with_empty, code) + def resolve_includes(target, source, env): # File collection FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents') @@ -142,6 +144,7 @@ def resolve_includes(target, source, env): file_to_write = "R\"(" + file_to_write + ")\"" out_file.write(file_to_write) + def create_version_file(target, source, env): # Generate string with build options library version to embed in the library: try: @@ -153,72 +156,87 @@ def create_version_file(target, source, env): with open(target[0].get_path(), "w") as fd: fd.write(build_info) -def get_cpu_runtime_files(operator): - file_list = [] - operators = filelist['cpu']['operators'] - - if "operator" in operators[operator]["files"]: - file_list += operators[operator]["files"]["operator"] - return file_list - -def get_gpu_runtime_files(operator): - file_list = [] - operators = filelist['gpu']['operators'] - - if "operator" in operators[operator]["files"]: - file_list += operators[operator]["files"]["operator"] - return file_list - -def get_cpu_kernel_files(operator): - - file_list = [] - file_list_sve = [] - operators = filelist['cpu']['operators'] - - if env['estate'] == '64' and "neon" in operators[operator]['files'] and "estate64" 
in operators[operator]['files']['neon']: - file_list += operators[operator]['files']['neon']['estate64'] - if env['estate'] == '32' and "neon" in operators[operator]['files'] and "estate32" in operators[operator]['files']['neon']: - file_list += operators[operator]['files']['neon']['estate32'] - - if "kernel" in operators[operator]["files"]: - file_list += operators[operator]["files"]["kernel"] - - if ("neon" in operators[operator]["files"]): - if any(i in env['data_type_support'] for i in ['all', 'qasymm8']) and ("qasymm8" in operators[operator]["files"]["neon"]): - file_list += operators[operator]["files"]["neon"]["qasymm8"] - if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']) and ("qasymm8_signed" in operators[operator]["files"]["neon"]): - file_list += operators[operator]["files"]["neon"]["qasymm8_signed"] - if any(i in env['data_type_support'] for i in ['all', 'qsymm16']) and ("qsymm16" in operators[operator]["files"]["neon"]): - file_list += operators[operator]["files"]["neon"]["qsymm16"] - if any(i in env['data_type_support'] for i in ['all', 'integer']) and ("integer" in operators[operator]["files"]["neon"]): - file_list += operators[operator]["files"]["neon"]["integer"] - - if (not "sve" in env['arch'] or env['fat_binary']) and ("neon" in operators[operator]["files"]): - if any(i in env['data_type_support'] for i in ['all', 'fp16']) and ("fp16" in operators[operator]["files"]["neon"]): - file_list += operators[operator]["files"]["neon"]["fp16"] - if any(i in env['data_type_support'] for i in ['all', 'fp32']) and ("fp32" in operators[operator]["files"]["neon"]): - file_list += operators[operator]["files"]["neon"]["fp32"] - if any(i in env['data_layout_support'] for i in ['all', 'nchw']) and ("nchw" in operators[operator]["files"]["neon"]): - file_list += operators[operator]['files']['neon']['nchw'] - if ("all" in operators[operator]["files"]["neon"]): - file_list += operators[operator]["files"]["neon"]["all"] - if ("sve" in env['arch'] 
or env['fat_binary']) and ("sve" in operators[operator]["files"]): - if any(i in env['data_type_support'] for i in ['all', 'fp16']) and ("fp16" in operators[operator]["files"]["sve"]): - file_list_sve += operators[operator]["files"]["sve"]["fp16"] - if any(i in env['data_type_support'] for i in ['all', 'fp32']) and ("fp32" in operators[operator]["files"]["sve"]): - file_list_sve += operators[operator]["files"]["sve"]["fp32"] - if any(i in env['data_type_support'] for i in ['all', 'qasymm8']) and ("qasymm8" in operators[operator]["files"]["sve"]): - file_list_sve += operators[operator]["files"]["sve"]["qasymm8"] - if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']) and ("qasymm8_signed" in operators[operator]["files"]["sve"]): - file_list_sve += operators[operator]["files"]["sve"]["qasymm8_signed"] - if any(i in env['data_type_support'] for i in ['all', 'qsymm16']) and ("qsymm16" in operators[operator]["files"]["sve"]): - file_list_sve += operators[operator]["files"]["sve"]["qsymm16"] - if any(i in env['data_type_support'] for i in ['all', 'integer']) and ("integer" in operators[operator]["files"]["sve"]): - file_list_sve += operators[operator]["files"]["sve"]["integer"] - if ("all" in operators[operator]["files"]["sve"]): - file_list_sve += operators[operator]["files"]["sve"]["all"] - - return file_list, file_list_sve + +def get_attrs_list(arch, estate, data_types, data_layouts): + attrs = [] + + # Manage data-types + if any(i in data_types for i in ['all']): + attrs += ['fp16', 'fp32', 'integer', 'qasymm8', 'qasymm8_signed', 'qsymm16'] + else: + if any(i in data_types for i in ['fp16']): attrs += ['fp16'] + if any(i in data_types for i in ['fp32']): attrs += ['fp32'] + if any(i in data_types for i in ['integer']): attrs += ['integer'] + if any(i in data_types for i in ['qasymm8']): attrs += ['qasymm8'] + if any(i in data_types for i in ['qasymm8_signed']): attrs += ['qasymm8_signed'] + if any(i in data_types for i in ['qsymm16']): attrs += 
['qsymm16'] + + # Manage data-layouts + if any(i in data_layouts for i in ['all']): + attrs += ['nhwc', 'nchw'] + else: + if any(i in data_layouts for i in ['nhwc']): attrs += ['nhwc'] + if any(i in data_layouts for i in ['nchw']): attrs += ['nchw'] + + # Manage execution state + estate_attr = 'estate32' if (estate == 'auto' and 'v7a' in arch) or '32' in estate else 'estate64' + attrs += [ estate_attr ] + + return attrs + + +def get_operator_backend_files(filelist, operators, backend='', techs=[], attrs=[]): + files = { "common" : [] } + + # Early return if filelist is empty + if backend not in filelist: + return files + + # Iterate over operators and create the file lists to compiler + for operator in operators: + if operator in filelist[backend]['operators']: + files['common'] += filelist[backend]['operators'][operator]["files"]["common"] + for tech in techs: + if tech in filelist[backend]['operators'][operator]["files"]: + # Add tech as a key to dictionary if not there + if tech not in files: + files[tech] = [] + + # Add tech files to the tech file list + tech_files = filelist[backend]['operators'][operator]["files"][tech] + files[tech] += tech_files.get('common', []) + for attr in attrs: + files[tech] += tech_files.get(attr, []) + + # Remove duplicates if they exist + return {k: list(set(v)) for k,v in files.items()} + +def collect_operators(filelist, operators, backend=''): + ops = set() + for operator in operators: + if operator in filelist[backend]['operators']: + ops.add(operator) + if 'deps' in filelist[backend]['operators'][operator]: + ops.update(filelist[backend]['operators'][operator]['deps']) + else: + print("Operator {0} is unsupported on {1} backend!".format(operator, backend)) + + return ops + + +def resolve_operator_dependencies(filelist, operators, backend=''): + resolved_operators = collect_operators(filelist, operators, backend) + + are_ops_resolved = False + while not are_ops_resolved: + resolution_pass = collect_operators(filelist, 
resolved_operators, backend) + if len(resolution_pass) != len(resolved_operators): + resolved_operators.update(resolution_pass) + else: + are_ops_resolved = True + + return resolved_operators + arm_compute_env = env.Clone() version_file = arm_compute_env.Command("src/core/arm_compute_version.embed", "", action=create_version_file) @@ -385,70 +403,61 @@ arm_compute_env.Append(LIBS = ['dl']) with (open(Dir('#').path + '/filelist.json')) as fp: filelist = json.load(fp) -core_files = Glob('src/core/*.cpp') -core_files += Glob('src/core/CPP/*.cpp') -core_files += Glob('src/core/CPP/kernels/*.cpp') -core_files += Glob('src/core/helpers/*.cpp') -core_files += Glob('src/core/utils/*.cpp') -core_files += Glob('src/core/utils/helpers/*.cpp') -core_files += Glob('src/core/utils/io/*.cpp') -core_files += Glob('src/core/utils/quantization/*.cpp') -core_files += Glob('src/core/utils/misc/*.cpp') -if env["logging"]: - core_files += Glob('src/core/utils/logging/*.cpp') +# Common backend files +lib_files = filelist['common'] -runtime_files_hp = Glob('src/runtime/*.cpp') -runtime_files_hp += Glob('src/runtime/CPP/ICPPSimpleFunction.cpp') -runtime_files = Glob('src/runtime/CPP/functions/*.cpp') +# Logging files +if env["logging"]: + lib_files += filelist['logging'] # C API files -runtime_files_hp += filelist['c_api']['common'] -runtime_files_hp += filelist['c_api']['operators'] +lib_files += filelist['c_api']['common'] +lib_files += filelist['c_api']['operators'] -if env['opencl']: - runtime_files_hp += filelist['c_api']['gpu'] - -# Common backend files -core_files += filelist['common'] - -# Initialize high priority core files -core_files_hp = core_files -core_files_sve_hp = [] -core_files = [] - -runtime_files_hp += Glob('src/runtime/CPP/SingleThreadScheduler.cpp') +# Scheduler infrastructure +lib_files += filelist['scheduler']['single'] +if env['cppthreads']: + lib_files += filelist['scheduler']['threads'] +if env['openmp']: + lib_files += filelist['scheduler']['omp'] +# Graph 
files graph_files = Glob('src/graph/*.cpp') graph_files += Glob('src/graph/*/*.cpp') -if env['cppthreads']: - runtime_files_hp += Glob('src/runtime/CPP/CPPScheduler.cpp') - -if env['openmp']: - runtime_files_hp += Glob('src/runtime/OMP/OMPScheduler.cpp') +# Specify user-defined priority operators +use_priority_ops = env['high_priority'] or env['build_config'] != "" +priority_operators = filelist['high_priority'] +if env['build_config'] != "": + build_config = env['build_config'] + build_config_contents = {} + if os.path.isfile(build_config): + with open(build_config) as f: + try: + build_config_contents = json.load(f) + except ValueError: + print("Warning: Build configuration file is of invalid JSON format!") + else: + try: + build_config_contents = json.loads(build_config) + except ValueError: + print("Warning: Build configuration string is of invalid JSON format!") + if build_config_contents: + priority_operators = build_config_contents.get("operators", []) if env['opencl']: - operators = filelist['gpu']['operators'] - for operator in operators: - if operator in filelist['gpu']['high_priority']: - runtime_files_hp += get_gpu_runtime_files(operator) - if "kernel" in operators[operator]["files"]: - core_files_hp += operators[operator]["files"]["kernel"] - else: - runtime_files += get_gpu_runtime_files(operator) - if "kernel" in operators[operator]["files"]: - core_files += operators[operator]["files"]["kernel"] + lib_files += filelist['c_api']['gpu'] + lib_files += filelist['gpu']['common'] - runtime_files_hp += filelist['gpu']['common'] - runtime_files += Glob('src/runtime/CL/functions/*.cpp') + cl_operators = priority_operators if use_priority_ops else filelist['gpu']['operators'].keys() + cl_ops_to_build = resolve_operator_dependencies(filelist, cl_operators, 'gpu') + lib_files += get_operator_backend_files(filelist, cl_ops_to_build, 'gpu')['common'] graph_files += Glob('src/graph/backends/CL/*.cpp') sve_o = [] -core_files_sve = [] +lib_files_sve = [] if env['neon']: - core_files += Glob('src/core/NEON/*.cpp') -
# build winograd/depthwise sources for either v7a / v8a arm_compute_env.Append(CPPPATH = ["src/core/NEON/kernels/convolution/common/", "src/core/NEON/kernels/convolution/winograd/", @@ -457,58 +466,55 @@ if env['neon']: "arm_compute/core/NEON/kernels/assembly/", "src/cpu/kernels/assembly/",]) - # Load files based on user's options - operators = filelist['cpu']['operators'] - for operator in operators: - if operator in filelist['cpu']['high_priority']: - runtime_files_hp += get_cpu_runtime_files(operator) - file_list, file_list_sve = get_cpu_kernel_files(operator) - core_files_hp += file_list - core_files_sve_hp += file_list_sve - else: - runtime_files += get_cpu_runtime_files(operator) - file_list, file_list_sve = get_cpu_kernel_files(operator) - core_files += file_list - core_files_sve += file_list_sve + lib_files += filelist['cpu']['common'] + + # Setup SIMD file list to include + simd = [] + if 'sve' in env['arch'] or env['fat_binary']: simd += ['sve'] + if 'sve' not in env['arch'] or env['fat_binary']: simd += ['neon'] - runtime_files_hp += filelist['cpu']['common'] - runtime_files_hp += Glob('src/runtime/NEON/*.cpp') - runtime_files += Glob('src/runtime/NEON/functions/*.cpp') + # Get attributes + attrs = get_attrs_list(env['arch'], env['estate'], env['data_type_support'], env['data_layout_support']) + + # Setup data-type and data-layout files to include + cpu_operators = priority_operators if use_priority_ops else filelist['cpu']['operators'].keys() + cpu_ops_to_build = resolve_operator_dependencies(filelist, cpu_operators, 'cpu') + cpu_files = get_operator_backend_files(filelist, cpu_ops_to_build, 'cpu', simd, attrs) + lib_files += cpu_files.get('common', []) + lib_files += cpu_files.get('neon', []) + lib_files_sve += cpu_files.get('sve', []) graph_files += Glob('src/graph/backends/NEON/*.cpp') +# Restrict from building graph API if a reduced operator list has been provided +if use_priority_ops: + print("Graph library requires all operators to
be built") + graph_files = [] + +# Build bootcode in case of bare-metal bootcode_o = [] if env['os'] == 'bare_metal': bootcode_files = Glob('bootcode/*.s') bootcode_o = build_bootcode_objs(bootcode_files) Export('bootcode_o') -high_priority_o = build_objs(core_files_hp + runtime_files_hp) -high_priority_sve_o = [] +# Build static libraries if (env['fat_binary']): - sve_o = build_sve_objs(core_files_sve) - high_priority_sve_o = build_sve_objs(core_files_sve_hp) - arm_compute_a = build_library('arm_compute-static', arm_compute_env, core_files + sve_o + high_priority_o + high_priority_sve_o + runtime_files, static=True) + sve_o = build_sve_objs(lib_files_sve) + arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_files + sve_o, static=True) else: - high_priority_o += build_objs(core_files_sve_hp) - arm_compute_a = build_library('arm_compute-static', arm_compute_env, core_files + core_files_sve + high_priority_o + runtime_files, static=True) + arm_compute_a = build_library('arm_compute-static', arm_compute_env, lib_files + lib_files_sve, static=True) Export('arm_compute_a') -if env['high_priority']: - arm_compute_hp_a = build_library('arm_compute_hp-static', arm_compute_env, high_priority_o + high_priority_sve_o, static=True) - Export('arm_compute_hp_a') +# Build shared libraries if env['os'] != 'bare_metal' and not env['standalone']: if (env['fat_binary']): - arm_compute_so = build_library('arm_compute', arm_compute_env, core_files + sve_o + high_priority_sve_o + high_priority_o + runtime_files, static=False) + arm_compute_so = build_library('arm_compute', arm_compute_env, lib_files + sve_o, static=False) else: - arm_compute_so = build_library('arm_compute', arm_compute_env, core_files + core_files_sve + high_priority_o + runtime_files , static=False) + arm_compute_so = build_library('arm_compute', arm_compute_env, lib_files + lib_files_sve, static=False) Export('arm_compute_so') - if env['high_priority']: - arm_compute_hp_so = 
build_library('arm_compute_hp', arm_compute_env, high_priority_sve_o + high_priority_o, static=False) - Export('arm_compute_hp_so') - # Generate dummy core lib for backwards compatibility arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, [], static=True) Export('arm_compute_core_a') @@ -519,6 +525,7 @@ if env['os'] != 'bare_metal' and not env['standalone']: arm_compute_graph_env = arm_compute_env.Clone() +# Build graph libraries arm_compute_graph_env.Append(CXXFLAGS = ['-Wno-redundant-move', '-Wno-pessimizing-move']) arm_compute_graph_a = build_library('arm_compute_graph-static', arm_compute_graph_env, graph_files, static=True, libs = [ arm_compute_a]) diff --git a/SConstruct b/SConstruct index ee8108bf71..7591075cd1 100644 --- a/SConstruct +++ b/SConstruct @@ -23,8 +23,10 @@ # SOFTWARE. import SCons +import json import os import subprocess +import sys def version_at_least(version, required): @@ -76,7 +78,8 @@ vars.AddVariables( ("extra_cxx_flags", "Extra CXX flags to be appended to the build command", ""), ("extra_link_flags", "Extra LD flags to be appended to the build command", ""), ("compiler_cache", "Command to prefix to the C and C++ compiler (e.g ccache)", ""), - ("specs_file", "Specs file to use (e.g. rdimon.specs)", "") + ("specs_file", "Specs file to use (e.g. rdimon.specs)", ""), + ("build_config", "Operator/Data-type/Data-layout configuration to use for tailored ComputeLibrary builds. 
Can be a JSON file or a JSON formatted string", "") ) env = Environment(platform="posix", variables=vars, ENV = os.environ) @@ -317,6 +320,13 @@ if env['fat_binary']: '-DARM_COMPUTE_ENABLE_FP16', '-DARM_COMPUTE_ENABLE_BF16', '-DARM_COMPUTE_ENABLE_I8MM', '-DARM_COMPUTE_ENABLE_SVEF32MM']) +if env['high_priority'] and env['build_config']: + print("The high priority library cannot be built in conjunction with a user-specified build configuration") + Exit(1) + +if not env['high_priority'] and not env['build_config']: + env.Append(CPPDEFINES = ['ARM_COMPUTE_GRAPH_ENABLED']) + if env['data_type_support']: if any(i in env['data_type_support'] for i in ['all', 'fp16']): env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS']) diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 62c94152e8..442d407660 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -57,7 +57,6 @@ #include "arm_compute/runtime/CL/functions/CLFFT2D.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLFill.h" -#include "arm_compute/runtime/CL/functions/CLFillBorder.h" #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h" #include "arm_compute/runtime/CL/functions/CLFloor.h" #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h deleted file mode 100644 index 20f2e15b72..0000000000 --- a/arm_compute/runtime/CL/functions/CLFillBorder.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLFILLBORDER_H -#define ARM_COMPUTE_CLFILLBORDER_H - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/ICLSimpleFunction.h" - -namespace arm_compute -{ -class CLCompileContext; -class ICLTensor; - -/** Basic function to run @ref CLFillBorderKernel */ -class CLFillBorder : public ICLSimpleFunction -{ -public: - /** Initialize the function - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src |dst | - * |:--------------|:--------------| - * |All |All | - * - * @param[in,out] tensor Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. - * @param[in] border_width The border width - * @param[in] border_mode Strategy to use for borders. 
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. - */ - void configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); - /** Initialize the function - * - * @param[in] compile_context The compile context to be used. - * @param[in,out] tensor Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32. - * @param[in] border_width The border width - * @param[in] border_mode Strategy to use for borders. - * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. - */ - void configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); -}; -} -#endif /*ARM_COMPUTE_FILLBORDER_H */ diff --git a/docs/user_guide/library.dox b/docs/user_guide/library.dox index 6c7b7e941f..fc08dbc437 100644 --- a/docs/user_guide/library.dox +++ b/docs/user_guide/library.dox @@ -561,6 +561,35 @@ Selecting fat_binary when building Compute Library, will create a library that c Based on the CPU support, the appropriate kernel will be selected at runtime for execution. Currently this option is only supported with armv8.2-a as the base architecture. +@subsection architecture_experimental_per_operator_build Per-operator build + +Dependencies for all operators have been explicitly defined, this provides the ability to users to generate Compute Library +binaries that include a user-defined list of operators. + +An experimental flag 'build_config' has been introduced where a JSON configuration file can be provided and consumed. 
+An example config looks like: +@code{.py} +{ + "operators": [ + "Activation", + "DepthwiseConv2d", + "Conv2d", + "Permute", + "Pool2d", + "Reshape" + ], + "data_types": [ + "NHWC" + ] +} +@endcode + +Supported data-types options are: +- "NHWC" +- "NCHW" + +The list of supported operators can be found in filelist.json in the root of Compute Library repo. + @subsection architecture_experimental_build_high_priority_operators Build high priority operators Selecting high_priority when building Compute Library, one new library will be created: libarm_compute_hp and diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox index 92b8f9b482..27ba52d72e 100644 --- a/docs/user_guide/operator_list.dox +++ b/docs/user_guide/operator_list.dox @@ -1404,9 +1404,9 @@ where N = batches, C = channels, H = height, W = width AllAll - FillBorder - Function to fill the borders within the XY-planes. - + FillBorder + Function to fill the borders within the XY-planes. +
  • n/a
@@ -1420,17 +1420,6 @@ where N = batches, C = channels, H = height, W = width srcdst AllAll - - CLFillBorder - -
    -
  • All -
- - -
srcdst -
AllAll -
FlattenLayer Reshape a tensor to be 1D diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index 2eb9aacce7..583cf4fb82 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -1315,7 +1315,7 @@ v17.09 Public major release - NEDequantizationLayerKernel / @ref NEDequantizationLayer - NEFloorKernel / @ref NEFloor - @ref NEL2NormalizeLayerKernel / @ref NEL2NormalizeLayer - - NEQuantizationLayerKernel @ref NEMinMaxLayerKernel / @ref NEQuantizationLayer + - NEQuantizationLayerKernel NEMinMaxLayerKernel / @ref NEQuantizationLayer - @ref NEROIPoolingLayerKernel / @ref NEROIPoolingLayer - @ref NEReductionOperationKernel / @ref NEReductionOperation - NEReshapeLayerKernel / @ref NEReshapeLayer @@ -1329,7 +1329,7 @@ v17.09 Public major release - CLGEMMTranspose1xW - CLGEMMMatrixVectorMultiplyKernel - @ref CLL2NormalizeLayerKernel / @ref CLL2NormalizeLayer - - CLQuantizationLayerKernel @ref CLMinMaxLayerKernel / @ref CLQuantizationLayer + - CLQuantizationLayerKernel CLMinMaxLayerKernel / @ref CLQuantizationLayer - @ref CLROIPoolingLayerKernel / @ref CLROIPoolingLayer - @ref CLReductionOperationKernel / @ref CLReductionOperation - CLReshapeLayerKernel / @ref CLReshapeLayer diff --git a/filelist.json b/filelist.json index 5171f39e12..4b85408e3d 100644 --- a/filelist.json +++ b/filelist.json @@ -7,8 +7,80 @@ "src/common/AllocatorWrapper.cpp", "src/common/ITensorV2.cpp", "src/common/TensorPack.cpp", - "src/common/IOperator.cpp" + "src/common/IOperator.cpp", + "src/core/AccessWindowAutoPadding.cpp", + "src/core/AccessWindowStatic.cpp", + "src/core/AccessWindowTranspose.cpp", + "src/core/Error.cpp", + "src/core/GPUTarget.cpp", + "src/core/Helpers.cpp", + "src/core/IAccessWindow.cpp", + "src/core/IKernel.cpp", + "src/core/ITensor.cpp", + "src/core/ITensorPack.cpp", + "src/core/Rounding.cpp", + "src/core/Size2D.cpp", + "src/core/SubTensorInfo.cpp", + 
"src/core/TensorInfo.cpp", + "src/core/Utils.cpp", + "src/core/Validate.cpp", + "src/core/Version.cpp", + "src/core/helpers/SoftmaxHelpers.cpp", + "src/core/helpers/WindowHelpers.cpp", + "src/core/utils/AssemblyUtils.cpp", + "src/core/utils/ScaleUtils.cpp", + "src/core/utils/helpers/fft.cpp", + "src/core/utils/helpers/tensor_transform.cpp", + "src/core/utils/io/FileHandler.cpp", + "src/core/utils/misc/MMappedFile.cpp", + "src/core/utils/quantization/AsymmHelpers.cpp", + "src/core/CPP/CPPTypes.cpp", + "src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp", + "src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp", + "src/core/CPP/kernels/CPPPermuteKernel.cpp", + "src/core/CPP/kernels/CPPTopKVKernel.cpp", + "src/core/CPP/kernels/CPPUpsampleKernel.cpp", + "src/runtime/Allocator.cpp", + "src/runtime/BlobLifetimeManager.cpp", + "src/runtime/BlobMemoryPool.cpp", + "src/runtime/ISimpleLifetimeManager.cpp", + "src/runtime/ITensorAllocator.cpp", + "src/runtime/IWeightsManager.cpp", + "src/runtime/IScheduler.cpp", + "src/runtime/Memory.cpp", + "src/runtime/MemoryManagerOnDemand.cpp", + "src/runtime/OffsetLifetimeManager.cpp", + "src/runtime/OffsetMemoryPool.cpp", + "src/runtime/OperatorTensor.cpp", + "src/runtime/PoolManager.cpp", + "src/runtime/RuntimeContext.cpp", + "src/runtime/Scheduler.cpp", + "src/runtime/SchedulerFactory.cpp", + "src/runtime/SchedulerUtils.cpp", + "src/runtime/SubTensor.cpp", + "src/runtime/Tensor.cpp", + "src/runtime/TensorAllocator.cpp", + "src/runtime/Utils.cpp", + "src/runtime/CPP/ICPPSimpleFunction.cpp", + "src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp", + "src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp", + "src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp", + "src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp", + "src/runtime/CPP/functions/CPPPermute.cpp", + "src/runtime/CPP/functions/CPPTopKV.cpp", + "src/runtime/CPP/functions/CPPUpsample.cpp" ], + "logging": [ + 
"src/core/utils/logging/FilePrinter.cpp", + "src/core/utils/logging/Helpers.cpp", + "src/core/utils/logging/Logger.cpp", + "src/core/utils/logging/LoggerRegistry.cpp" + ], + "scheduler": { + "single": [ "src/runtime/CPP/SingleThreadScheduler.cpp" ], + "threads": [ "src/runtime/CPP/CPPScheduler.cpp" ], + "omp": [ "src/runtime/OMP/OMPScheduler.cpp"] + }, "c_api": { "common": [ "src/c/AclContext.cpp", @@ -28,6 +100,14 @@ "src/c/operators/AclActivation.cpp" ] }, + "high_priority": [ + "Activation", + "DepthwiseConv2d", + "Conv2d", + "Permute", + "Pool2d", + "Reshape" + ], "gpu": { "common": [ "src/core/CL/CLCompileContext.cpp", @@ -41,19 +121,11 @@ "src/core/CL/ICLSimpleKernel.cpp", "src/core/CL/ICLTensor.cpp", "src/core/CL/OpenCL.cpp", - "src/gpu/cl/ClKernelLibrary.cpp", - "src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp", - "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp", - "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp", - "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp", - "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp", - "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp", - "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp", - "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp", - "src/core/CL/kernels/CLFillBorderKernel.cpp", "src/gpu/cl/ClContext.cpp", + "src/gpu/cl/ClKernelLibrary.cpp", "src/gpu/cl/ClQueue.cpp", "src/gpu/cl/ClTensor.cpp", + "src/core/CL/kernels/CLFillBorderKernel.cpp", "src/runtime/CL/CLBufferAllocator.cpp", "src/runtime/CL/CLGEMMHeuristicsHandle.cpp", "src/runtime/CL/CLHelpers.cpp", @@ -68,888 +140,1022 @@ "src/runtime/CL/CLTuner.cpp", "src/runtime/CL/ICLSimpleFunction.cpp", "src/runtime/CL/Utils.cpp", - "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp", - "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp", - "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp", 
- "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp", "src/runtime/CL/mlgo/HeuristicTree.cpp", "src/runtime/CL/mlgo/MLGOHeuristics.cpp", "src/runtime/CL/mlgo/MLGOParser.cpp", "src/runtime/CL/mlgo/Utils.cpp", "src/runtime/CL/tuners/CLTuningParametersList.cpp" ], - "high_priority": [ - "Activation", - "DepthwiseConv2d", - "DirectConv2d", - "Permute", - "Pool2d", - "Reshape" - ], "operators": { - "Activation": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClActivation.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClActivationKernel.cpp" - ] - } - }, - "Add": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClAdd.cpp" - ] - } - }, - "Cast": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClCast.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClCastKernel.cpp" - ] - } - }, - "Concatenate": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClConcatenate.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp", - "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp", - "src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp", - "src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", - "src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp", - "src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp" - ] - } - }, - "DirectConv2d": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClDirectConv2d.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClDirectConv2dKernel.cpp" - ] - } - }, - "FullyConnected": { - "deps": [ - "ClFlatten", - "ClConvertFullyConnectedWeights", - "ClGemm", - "ClGemmLowpMatrixMultiplyCore", - "ClTranspose" - ], - "files": { - "operator": [ - "src/gpu/cl/operators/ClFullyConnected.cpp" - ] - } - }, - "ConvertFullyConnectedWeights": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp" - ] - } - }, - "Permute": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClPermute.cpp" - ], - 
"kernel": [ - "src/gpu/cl/kernels/ClPermuteKernel.cpp" - ] - } - }, - "Pool2d": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClPool2d.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClPool2dKernel.cpp" - ] - } - }, - "Conv2d": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClConv2d.cpp" - ] - } - }, - "PRelu": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClPRelu.cpp" - ] - } - }, - "Reshape": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClReshape.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClReshapeKernel.cpp" - ] - } - }, - "Copy": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClCopy.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClCopyKernel.cpp" - ] - } - }, - "Crop": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClCrop.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClCropKernel.cpp" - ] - } - }, - "Dequantize": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClDequantize.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClDequantizeKernel.cpp" - ] - } - }, - "Elementwise": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClElementwiseOperations.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClElementwiseKernel.cpp" - ] - } - }, - "ElementwiseUnary": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClElementwiseUnary.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp" - ] - } - }, - "Fill": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClFill.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClFillKernel.cpp" - ] - } - }, - "Flatten": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClFlatten.cpp" - ] - } - }, - "Floor": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClFloor.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClFloorKernel.cpp" - ] - } - }, - "GEMM": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClGemm.cpp", - "src/gpu/cl/operators/ClGemmConv2d.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp", - 
"src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp", - "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp", - "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp", - "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp", - "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp" - ] - } - }, - "GEMMLowp": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp", - "src/gpu/cl/operators/ClGemmLowpOutputStage.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp", - "src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp" - ] - } - }, - "Mul": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClMul.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClMulKernel.cpp" - ] - } - }, - "Quantize": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClQuantize.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClQuantizeKernel.cpp" - ] - } - }, - "Scale": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClScale.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClScaleKernel.cpp" - ] - } - }, - "Softmax": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClSoftmax.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClSoftmaxKernel.cpp" - ] - } - }, - "Sub": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClSub.cpp" - ] - } - }, - "Transpose": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClTranspose.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClTransposeKernel.cpp" - ] - } - }, - 
"GenerateProposals": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp" - ] - } - }, - "ArgMinMax": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp" - ] - } - }, - "BatchNormalization": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp" - ] - } - }, - "BatchToSpace": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp" - ] - } - }, - "Bitwise": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLBitwiseKernel.cpp" - ] - } - }, - "BoundingBoxTransform": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp" - ] - } - }, - "ChannelShuffleLayer": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp" - ] - } - }, - "GEMMConv2d": { - "files": { - "kernel": [ - "src/gpu/cl/kernels/ClCol2ImKernel.cpp", - "src/gpu/cl/kernels/ClIm2ColKernel.cpp" - ] - } - }, - "Comparison": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLComparisonKernel.cpp" - ] - } - }, - "DeconvolutionLayerUpsample": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp" - ] - } - }, - "DeconvolutionReshapeOutput": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp" - ] - } - }, - "DepthToSpace": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp" - ] - } - }, - "DepthwiseConvolutionLayerNative": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp" - ] - } - }, - "FFTDigitReverse": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp" - ] - } - }, - "FFTRadixStage": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLFFTRadixStageKernel.cpp" - ] - } - }, - "FFTScale": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLFFTScaleKernel.cpp" - ] - } - }, - "FuseBatchNormalization": { - "files": { - "kernel": [ - 
"src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp" - ] - } - }, - "Gather": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLGatherKernel.cpp" - ] - } - }, - "InstanceNormalization": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp" - ] - } - }, - "L2Normalize": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp" - ] - } - }, - "LogicalNot": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClLogicalNot.cpp" - ] - } - }, - "MaxUnpooling": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp" - ] - } - }, - "MeanStdDevNormalization": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp" - ] - } - }, - "MinMax": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLMinMaxLayerKernel.cpp" - ] - } - }, - "Normalization": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLNormalizationLayerKernel.cpp" - ] - } - }, - "NormalizePlanarYUV": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp" - ] - } - }, - "Pad": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLPadLayerKernel.cpp" - ] - } - }, - "PriorBox": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp" - ] - } - }, - "QLSTMLayerNormalization": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp" - ] - } - }, - "Range": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLRangeKernel.cpp" - ] - } - }, - "ReductionOperation": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLReductionOperationKernel.cpp" - ] - } - }, - "Remap": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLRemapKernel.cpp" - ] - } - }, - "Reorg": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLReorgLayerKernel.cpp" - ] - } - }, - "Reverse": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLReverseKernel.cpp" - ] - } - }, - "ROIAlign": { - "files": { - "kernel": [ - 
"src/core/CL/kernels/CLROIAlignLayerKernel.cpp" - ] - } - }, - "ROIPooling": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp" - ] - } - }, - "Select": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLSelectKernel.cpp" - ] - } - }, - "SpaceToBatch": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp" - ] - } - }, - "SpaceToDepth": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp" - ] - } - }, - "Stack": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLStackLayerKernel.cpp" - ] - } - }, - "StridedSlice": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLStridedSliceKernel.cpp" - ] - } - }, - "Tile": { - "files": { - "kernel": [ - "src/core/CL/kernels/CLTileKernel.cpp" - ] - } - }, - "WeightsReshape": { - "files": { - "kernel": [ - "src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp" - ] - } - }, - "WinogradConv2d": { - "files": { - "operator": [ - "src/gpu/cl/operators/ClWinogradConv2d.cpp" - ], - "kernel": [ - "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp", - "src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp", - "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp" - ] - } + "Activation":{ + "files": { + "common": [ + "src/gpu/cl/kernels/ClActivationKernel.cpp", + "src/gpu/cl/operators/ClActivation.cpp", + "src/runtime/CL/functions/CLActivationLayer.cpp" + ] + } + }, + "ArgMinMax": { + "deps": [ "Reshape" ], + "files": { + "common": [ + "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp", + "src/runtime/CL/functions/CLArgMinMaxLayer.cpp" + ] + } + }, + "Add": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClElementwiseKernel.cpp", + "src/gpu/cl/operators/ClAdd.cpp" + ] + } + }, + "BatchNormalization": { + "files": { + "common": [ + "src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp", + "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp", + "src/runtime/CL/functions/CLBatchNormalizationLayer.cpp", + 
"src/runtime/CL/functions/CLFuseBatchNormalization.cpp" + ] + } + }, + "BatchToSpace": { + "files": { + "common": [ + "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp", + "src/runtime/CL/functions/CLBatchToSpaceLayer.cpp" + ] + } + }, + "Bitwise": { + "files": { + "common": [ "src/core/CL/kernels/CLBitwiseKernel.cpp" ] + } + }, + "BitwiseAnd": { + "deps": [ "Bitwise" ], + "files": { + "common": [ "src/runtime/CL/functions/CLBitwiseAnd.cpp" ] + } + }, + "BitwiseNot": { + "deps": [ "Bitwise" ], + "files": { + "common": [ "src/runtime/CL/functions/CLBitwiseNot.cpp" ] + } + }, + "BitwiseOr": { + "deps": [ "Bitwise" ], + "files": { + "common": [ "src/runtime/CL/functions/CLBitwiseOr.cpp" ] + } + }, + "BitwiseXor": { + "deps": [ "Bitwise" ], + "files": { + "common": [ "src/runtime/CL/functions/CLBitwiseXor.cpp" ] + } + }, + "BoundingBoxTransform": { + "files": { + "common": [ + "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp", + "src/runtime/CL/functions/CLBoundingBoxTransform.cpp" + ] + } + }, + "Cast": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClCastKernel.cpp", + "src/gpu/cl/operators/ClCast.cpp", + "src/runtime/CL/functions/CLCast.cpp" + ] + } + }, + "ChannelShuffle": { + "files": { + "common": [ + "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp", + "src/runtime/CL/functions/CLChannelShuffleLayer.cpp" + ] + } + }, + "Comparison": { + "files": { + "common": [ + "src/core/CL/kernels/CLComparisonKernel.cpp", + "src/runtime/CL/functions/CLComparison.cpp" + ] + } + }, + "Concatenate": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp", + "src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp", + "src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", + "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp", + "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp", + "src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp", + "src/gpu/cl/operators/ClConcatenate.cpp", + "src/runtime/CL/functions/CLConcatenateLayer.cpp" + ] + } + }, + 
"Conv2d": { + "deps": [ + "Activation", + "ElementwiseBinary", + "FFT2D", + "Gemm", + "Mul", + "Pad", + "Permute", + "Reduction", + "Reshape", + "Reverse", + "Slice" + ], + "files": { + "common": [ + "src/gpu/cl/kernels/ClDirectConv2dKernel.cpp", + "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp", + "src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp", + "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp", + "src/gpu/cl/kernels/ClIm2ColKernel.cpp", + "src/gpu/cl/kernels/ClCol2ImKernel.cpp", + "src/gpu/cl/operators/ClConv2d.cpp", + "src/gpu/cl/operators/ClDirectConv2d.cpp", + "src/gpu/cl/operators/ClGemmConv2d.cpp", + "src/gpu/cl/operators/ClWinogradConv2d.cpp", + "src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp", + "src/runtime/CL/functions/CLConvolutionLayer.cpp", + "src/runtime/CL/functions/CLDirectConvolutionLayer.cpp", + "src/runtime/CL/functions/CLFFTConvolutionLayer.cpp", + "src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp", + "src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp" + ] + } + }, + "Copy": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClCopyKernel.cpp", + "src/gpu/cl/operators/ClCopy.cpp", + "src/runtime/CL/functions/CLCopy.cpp" + ] + } + }, + "CropResize": { + "deps": [ "Copy", "Fill", "Scale" ], + "files": { + "common": [ + "src/gpu/cl/kernels/ClCropKernel.cpp", + "src/gpu/cl/operators/ClCrop.cpp", + "src/runtime/CL/functions/CLCrop.cpp", + "src/runtime/CL/functions/CLCropResize.cpp" + ] + } + }, + "Deconv2d": { + "deps": [ "Conv2d", "Reverse", "Transpose"], + "files": { + "common": [ + "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp", + "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp", + "src/runtime/CL/functions/CLDeconvolutionLayer.cpp", + "src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp", + "src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp", + "src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp" + ] + } + }, + "DepthConvert": { + "deps": [ "Cast"], + "files": { + 
"common": [ "src/runtime/CL/functions/CLDepthConvertLayer.cpp" ] + } + }, + "DepthToSpace": { + "files": { + "common": [ + "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp", + "src/runtime/CL/functions/CLDepthToSpaceLayer.cpp" + ] + } + }, + "DepthwiseConv2d": { + "deps": [ "Permute" ], + "files": { + "common": [ + "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp", + "src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp" + ] + } + }, + "Dequantize": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClDequantizeKernel.cpp", + "src/gpu/cl/operators/ClDequantize.cpp", + "src/runtime/CL/functions/CLDequantizationLayer.cpp" + ] + } + }, + "ElementwiseBinary": { + "deps": ["Add", "Sub"], + "files": { + "common": [ + "src/gpu/cl/kernels/ClElementwiseKernel.cpp", + "src/gpu/cl/operators/ClElementwiseOperations.cpp", + "src/runtime/CL/functions/CLElementwiseOperations.cpp" + ] + } + }, + "ElementwiseUnary":{ + "files": { + "common": [ + "src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp", + "src/gpu/cl/operators/ClElementwiseUnary.cpp", + "src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp" + ] + } + }, + "FFT1D": { + "files": { + "common": [ + "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp", + "src/core/CL/kernels/CLFFTRadixStageKernel.cpp", + "src/core/CL/kernels/CLFFTScaleKernel.cpp", + "src/runtime/CL/functions/CLFFT1D.cpp" + ] + } + }, + "FFT2D": { + "deps": [ "FFT1D" ], + "files": { + "common": [ "src/runtime/CL/functions/CLFFT2D.cpp" ] + } + }, + "Fill": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClFillKernel.cpp", + "src/gpu/cl/operators/ClFill.cpp", + "src/runtime/CL/functions/CLFill.cpp" + ] + } + }, + "Flatten": { + "files": { + "common": [ + "src/gpu/cl/operators/ClFlatten.cpp", + "src/runtime/CL/functions/CLFlattenLayer.cpp" + ] + } + }, + "Floor": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClFloorKernel.cpp", + "src/gpu/cl/operators/ClFloor.cpp", + "src/runtime/CL/functions/CLFloor.cpp" + ] + } + }, + "FullyConnected": 
{ + "deps": [ "Flatten", "Gemm", "Transpose"], + "files": { + "common": [ + "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp", + "src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp", + "src/gpu/cl/operators/ClFullyConnected.cpp", + "src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp", + "src/runtime/CL/functions/CLFullyConnectedLayer.cpp" + ] + } + }, + "Gather": { + "files": { + "common": [ + "src/core/CL/kernels/CLGatherKernel.cpp", + "src/runtime/CL/functions/CLGather.cpp"] + } + }, + "Gemm": { + "deps": [ "Cast" ], + "files": { + "common": [ + "src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp", + "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp", + "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp", + "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp", + "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp", + "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp", + "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp", + "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp", + "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp", + "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp", + "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp", + "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp", + "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp", + 
"src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp", + "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp", + "src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp", + "src/gpu/cl/operators/ClGemm.cpp", + "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp", + "src/gpu/cl/operators/ClGemmLowpOutputStage.cpp", + "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp", + "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp", + "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp", + "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp", + "src/runtime/CL/functions/CLGEMM.cpp", + "src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp", + "src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp" + ] + } + }, + "GenerateProposals": { + "deps": [ "BoundingBoxTransform", "Dequantize", "Pad", "Permute", "Quantize", "Reshape" ], + "files": { + "common": [ + "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp", + "src/runtime/CL/functions/CLGenerateProposalsLayer.cpp" + ] + } + }, + "InstanceNormalize": { + "files": { + "common": [ + "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp", + "src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp" + ] + } + }, + "L2Normalize": { + "deps": [ "Reduction" ], + "files": { + "common": [ + "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp", + "src/runtime/CL/functions/CLL2NormalizeLayer.cpp" + ] + } + }, + "Logical": { + "files": { + "common": [ + "src/gpu/cl/operators/ClLogicalNot.cpp", + "src/runtime/CL/functions/CLLogicalAnd.cpp", + "src/runtime/CL/functions/CLLogicalNot.cpp", + "src/runtime/CL/functions/CLLogicalOr.cpp" + ] + } + }, + "LSTM": { + "deps": [ + "Activation", + "Concatenate", + "Copy", + "Dequantize", + "ElementwiseBinary", + "Fill", + "FullyConnected", + "Gemm", + "MeanStdDevNormalize", + "Mul", + "Quantize", + "Slice", + "Transpose" + ], + "files": { + "common": [ + "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp", + "src/runtime/CL/functions/CLQLSTMLayer.cpp", + 
"src/runtime/CL/functions/CLLSTMLayer.cpp", + "src/runtime/CL/functions/CLLSTMLayerQuantized.cpp" + ] + } + }, + "MaxUnpool2d": { + "deps": [ "Fill" ], + "files": { + "common": [ + "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp", + "src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp" + ] + } + }, + "MeanStdDevNormalize": { + "deps": [ "Reduction" ], + "files": { + "common": [ + "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp", + "src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp", + "src/runtime/CL/functions/CLReduceMean.cpp" + ] + } + }, + "Mul": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClMulKernel.cpp", + "src/gpu/cl/operators/ClMul.cpp", + "src/runtime/CL/functions/CLPixelWiseMultiplication.cpp" + ] + } + }, + "Normalize": { + "files": { + "common": [ + "src/core/CL/kernels/CLNormalizationLayerKernel.cpp", + "src/runtime/CL/functions/CLNormalizationLayer.cpp" + ] + } + }, + "Pad": { + "deps": [ "Copy" ], + "files": { + "common": [ + "src/core/CL/kernels/CLPadLayerKernel.cpp", + "src/runtime/CL/functions/CLPadLayer.cpp" + ] + } + }, + "Permute": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClPermuteKernel.cpp", + "src/gpu/cl/operators/ClPermute.cpp", + "src/runtime/CL/functions/CLPermute.cpp" + ] + } + }, + "Pool2d": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClPool2dKernel.cpp", + "src/gpu/cl/operators/ClPool2d.cpp", + "src/runtime/CL/functions/CLPoolingLayer.cpp" + ] + } + }, + "PRelu": { + "deps": [ "ElementwiseBinary" ], + "files": { + "common": [ + "src/gpu/cl/operators/ClPRelu.cpp", + "src/runtime/CL/functions/CLPReluLayer.cpp" + ] + } + }, + "PriorBox": { + "files": { + "common": [ + "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp", + "src/runtime/CL/functions/CLPriorBoxLayer.cpp" + ] + } + }, + "Quantize": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClQuantizeKernel.cpp", + "src/gpu/cl/operators/ClQuantize.cpp", + "src/runtime/CL/functions/CLQuantizationLayer.cpp" + ] + } + }, + "Range": { + "files": { + 
"common": [ + "src/core/CL/kernels/CLRangeKernel.cpp", + "src/runtime/CL/functions/CLRange.cpp" + ] + } + }, + "Reduction": { + "deps": [ "Reshape" ], + "files": { + "common": [ + "src/core/CL/kernels/CLReductionOperationKernel.cpp", + "src/runtime/CL/functions/CLReductionOperation.cpp" + ] + } + }, + "Remap": { + "files": { + "common": [ + "src/core/CL/kernels/CLRemapKernel.cpp", + "src/runtime/CL/functions/CLRemap.cpp"] + } + }, + "Reorg": { + "files": { + "common": [ + "src/core/CL/kernels/CLReorgLayerKernel.cpp", + "src/runtime/CL/functions/CLReorgLayer.cpp" + ] + } + }, + "Reshape": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClReshapeKernel.cpp", + "src/gpu/cl/operators/ClReshape.cpp", + "src/runtime/CL/functions/CLReshapeLayer.cpp" + ] + } + }, + "Reverse": { + "files": { + "common": [ + "src/core/CL/kernels/CLReverseKernel.cpp", + "src/runtime/CL/functions/CLReverse.cpp" + ] + } + }, + "RNN": { + "deps": [ "Activation", "Cast", "ElementwiseBinary", "FullyConnected", "Gemm"], + "files": { + "common": [ "src/runtime/CL/functions/CLRNNLayer.cpp" ] + } + }, + "ROIAlign": { + "files": { + "common": [ + "src/core/CL/kernels/CLROIAlignLayerKernel.cpp", + "src/runtime/CL/functions/CLROIAlignLayer.cpp" + ] + } + }, + "ROIPool2d": { + "files": { + "common": [ + "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp", + "src/runtime/CL/functions/CLROIPoolingLayer.cpp" + ] + } + }, + "Scale": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClScaleKernel.cpp", + "src/gpu/cl/operators/ClScale.cpp", + "src/runtime/CL/functions/CLScale.cpp" + ] + } + }, + "Select": { + "files": { + "common": [ + "src/core/CL/kernels/CLSelectKernel.cpp", + "src/runtime/CL/functions/CLSelect.cpp" + ] + } + }, + "Slice": { + "deps": [ "StridedSlice" ], + "files": { + "common": [ "src/runtime/CL/functions/CLSlice.cpp" ] + } + }, + "Softmax": { + "deps": [ "Permute" ], + "files": { + "common": [ + "src/gpu/cl/kernels/ClSoftmaxKernel.cpp", + "src/gpu/cl/operators/ClSoftmax.cpp", + 
"src/runtime/CL/functions/CLSoftmaxLayer.cpp" + ] + } + }, + "SpaceToBatch": { + "files": { + "common": [ + "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp", + "src/runtime/CL/functions/CLSpaceToBatchLayer.cpp" + ] + } + }, + "SpaceToDepth": { + "files": { + "common": [ + "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp", + "src/runtime/CL/functions/CLSpaceToDepthLayer.cpp" + ] + } + }, + "Split": { + "deps": [ "StridedSlice" ], + "files": { + "common": [ "src/runtime/CL/functions/CLSplit.cpp" ] + } + }, + "Stack": { + "files": { + "common": [ + "src/core/CL/kernels/CLStackLayerKernel.cpp", + "src/runtime/CL/functions/CLStackLayer.cpp" + ] + } + }, + "StridedSlice": { + "files": { + "common": [ + "src/core/CL/kernels/CLStridedSliceKernel.cpp", + "src/runtime/CL/functions/CLStridedSlice.cpp" + ] + } + }, + "Sub": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClElementwiseKernel.cpp", + "src/gpu/cl/operators/ClSub.cpp" + ] + } + }, + "Tile": { + "files": { + "common": [ + "src/core/CL/kernels/CLTileKernel.cpp", + "src/runtime/CL/functions/CLTile.cpp" + ] + } + }, + "Transpose": { + "files": { + "common": [ + "src/gpu/cl/kernels/ClTransposeKernel.cpp", + "src/gpu/cl/operators/ClTranspose.cpp", + "src/runtime/CL/functions/CLTranspose.cpp" + ] + } + }, + "Unstack": { + "deps": [ "StridedSlice" ], + "files": { + "common": [ "src/runtime/CL/functions/CLUnstack.cpp" ] + } + }, + "YUVNormalize": { + "files": { + "common": [ + "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp", + "src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp" + ] } } - }, + } +}, "cpu": { "common": [ "src/cpu/CpuContext.cpp", "src/cpu/CpuQueue.cpp", - "src/cpu/CpuTensor.cpp" - ], - "high_priority": [ - "Activation", - "DepthwiseConv2d", - "DirectConv2d", - "Permute", - "Pool2d", - "Reshape", - "FillBorder" + "src/cpu/CpuTensor.cpp", + "src/core/NEON/kernels/NEFillBorderKernel.cpp", + "src/runtime/NEON/INEOperator.cpp", + "src/runtime/NEON/INESimpleFunction.cpp", + 
"src/runtime/NEON/INESimpleFunctionNoBorder.cpp" ], "operators": { "Activation": { "files": { - "operator": [ - "src/cpu/operators/CpuActivation.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuActivationKernel.cpp" + "common": [ + "src/cpu/operators/CpuActivation.cpp", + "src/cpu/kernels/CpuActivationKernel.cpp", + "src/runtime/NEON/functions/NEActivationLayer.cpp" ], - "sve": { - "fp32": [ - "src/cpu/kernels/activation/sve/fp32.cpp" - ], - "fp16": [ - "src/cpu/kernels/activation/sve/fp16.cpp" - ], - "qsymm16": [ - "src/cpu/kernels/activation/sve/qsymm16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/activation/sve/qasymm8.cpp" - ], - "qasymm8_signed": [ - "src/cpu/kernels/activation/sve/qasymm8_signed.cpp" - ] - }, "neon": { - "fp32": [ - "src/cpu/kernels/activation/neon/fp32.cpp" - ], - "fp16": [ - "src/cpu/kernels/activation/neon/fp16.cpp" - ], - "qsymm16": [ - "src/cpu/kernels/activation/neon/qsymm16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/activation/neon/qasymm8.cpp" - ], - "qasymm8_signed": [ - "src/cpu/kernels/activation/neon/qasymm8_signed.cpp" - ] + "fp16": [ "src/cpu/kernels/activation/neon/fp16.cpp" ], + "fp32": [ "src/cpu/kernels/activation/neon/fp32.cpp" ], + "qasymm8": [ "src/cpu/kernels/activation/neon/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/activation/neon/qasymm8_signed.cpp" ], + "qsymm16": [ "src/cpu/kernels/activation/neon/qsymm16.cpp" ] + }, + "sve": { + "fp16": [ "src/cpu/kernels/activation/sve/fp16.cpp" ], + "fp32": [ "src/cpu/kernels/activation/sve/fp32.cpp" ], + "qasymm8": [ "src/cpu/kernels/activation/neon/qasymm8.cpp", "src/cpu/kernels/activation/sve/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/activation/neon/qasymm8_signed.cpp", "src/cpu/kernels/activation/sve/qasymm8_signed.cpp" ], + "qsymm16": [ "src/cpu/kernels/activation/neon/qsymm16.cpp", "src/cpu/kernels/activation/sve/qsymm16.cpp" ] } } }, + "ArgMinMax": { + "deps": [ "Reduction" ], + "files": { + "common": [ 
"src/runtime/NEON/functions/NEArgMinMaxLayer.cpp" ] + } + }, "Add": { "files": { - "operator": [ - "src/cpu/operators/CpuAdd.cpp" + "common": [ + "src/cpu/operators/CpuAdd.cpp", + "src/cpu/kernels/CpuAddKernel.cpp", + "src/runtime/NEON/functions/NEArithmeticAddition.cpp" ], - "kernel": [ - "src/cpu/kernels/CpuAddKernel.cpp" - ], - "sve": { - "all": [ - "src/cpu/kernels/add/sve/impl.cpp" - ], - "qsymm16": [ - "src/cpu/kernels/add/sve/qsymm16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/add/sve/qasymm8.cpp" - ], - "qasymm8_signed": [ - "src/cpu/kernels/add/sve/qasymm8_signed.cpp" - ] - }, "neon": { - "qsymm16": [ - "src/cpu/kernels/add/neon/qsymm16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/add/neon/qasymm8.cpp" - ], - "qasymm8_signed": [ - "src/cpu/kernels/add/neon/qasymm8_signed.cpp" - ] + "qasymm8": [ "src/cpu/kernels/add/neon/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/add/neon/qasymm8_signed.cpp" ], + "qsymm16": [ "src/cpu/kernels/add/neon/qsymm16.cpp" ] + }, + "sve": { + "common": [ "src/cpu/kernels/add/sve/impl.cpp" ], + "qasymm8": [ "src/cpu/kernels/add/neon/qasymm8.cpp", "src/cpu/kernels/add/sve/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/add/neon/qasymm8_signed.cpp", "src/cpu/kernels/add/sve/qasymm8_signed.cpp" ], + "qsymm16": [ "src/cpu/kernels/add/neon/qsymm16.cpp", "src/cpu/kernels/add/sve/qsymm16.cpp" ] } } }, - "BatchNorm": { + "BatchNormalize": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp", + "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp", + "src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp", + "src/runtime/NEON/functions/NEFuseBatchNormalization.cpp" ], - "sve": { - "fp32": [ - "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp" - ], - "fp16": [ - "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp" - ] - }, "neon": { - "fp32": [ - 
"src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp" - ], - "fp16": [ - "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp" - ] + "fp16": [ "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp" ], + "fp32": [ "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp" ] + }, + "sve": { + "fp16": [ "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp" ], + "fp32": [ "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp" ] } } }, "BatchToSpace": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp", + "src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp" ] } }, "BitwiseAnd": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEBitwiseAndKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEBitwiseAndKernel.cpp", + "src/runtime/NEON/functions/NEBitwiseAnd.cpp" ] } }, "BitwiseNot": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEBitwiseNotKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEBitwiseNotKernel.cpp", + "src/runtime/NEON/functions/NEBitwiseNot.cpp" ] } }, "BitwiseOr": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEBitwiseOrKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEBitwiseOrKernel.cpp", + "src/runtime/NEON/functions/NEBitwiseOr.cpp" ] } }, "BitwiseXor": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEBitwiseXorKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEBitwiseXorKernel.cpp", + "src/runtime/NEON/functions/NEBitwiseXor.cpp" ] } }, "BoundingBoxTransform": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp" - ] - } - }, - "ChannelShuffleLayer": { - "files": { - "kernel": [ - "src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp", + "src/runtime/NEON/functions/NEBoundingBoxTransform.cpp" ] } }, - "Col2Im": { + "Cast": { "files": { - "kernel": [ - 
"src/cpu/kernels/CpuCol2ImKernel.cpp" + "common": [ + "src/cpu/operators/CpuCast.cpp", + "src/cpu/kernels/CpuCastKernel.cpp", + "src/runtime/NEON/functions/NECast.cpp" ] } }, - "Cast": { + "ChannelShuffle": { "files": { - "operator": [ - "src/cpu/operators/CpuCast.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuCastKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp", + "src/runtime/NEON/functions/NEChannelShuffleLayer.cpp" ] } }, "Concatenate": { "files": { - "operator": [ - "src/cpu/operators/CpuConcatenate.cpp" - ], - "kernel": [ + "common": [ + "src/cpu/operators/CpuConcatenate.cpp", "src/cpu/kernels/CpuConcatenateWidthKernel.cpp", "src/cpu/kernels/CpuConcatenateBatchKernel.cpp", "src/cpu/kernels/CpuConcatenateDepthKernel.cpp", - "src/cpu/kernels/CpuConcatenateHeightKernel.cpp" + "src/cpu/kernels/CpuConcatenateHeightKernel.cpp", + "src/runtime/NEON/functions/NEConcatenateLayer.cpp" ] } }, - "ConvertFullyConnectedWeights": { + "Conv2d": { + "deps": [ + "Activation", + "ElementwiseBinary", + "FFT2D", + "Gemm", + "Mul", + "Pad", + "Permute", + "Reshape", + "Reverse", + "Slice" + ], "files": { - "operator": [ - "src/cpu/operators/CpuConvertFullyConnectedWeights.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp" + "common": [ + "src/cpu/operators/CpuConv2d.cpp", + "src/cpu/operators/CpuDirectConv2d.cpp", + "src/cpu/operators/CpuGemmDirectConv2d.cpp", + "src/cpu/operators/CpuGemmConv2d.cpp", + "src/cpu/operators/CpuWinogradConv2d.cpp", + "src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp", + "src/cpu/kernels/CpuDirectConv2dKernel.cpp", + "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp", + "src/cpu/kernels/CpuWinogradConv2dKernel.cpp", + "src/cpu/kernels/CpuCol2ImKernel.cpp", + "src/cpu/kernels/CpuIm2ColKernel.cpp", + "src/cpu/kernels/CpuWeightsReshapeKernel.cpp", + "src/core/NEON/kernels/convolution/common/padding.cpp", + "src/core/NEON/kernels/convolution/common/qasymm8.cpp", + 
"src/core/NEON/kernels/convolution/common/qsymm8.cpp", + "src/core/NEON/kernels/convolution/common/utils.cpp", + "src/core/NEON/kernels/convolution/winograd/padding.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp", + 
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp", + "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp", + "src/runtime/NEON/functions/NEConvolutionLayer.cpp", + "src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp", + "src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp", + "src/runtime/NEON/functions/NEGEMMConv2d.cpp", + "src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp", + "src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp" ] } }, - "ConvertQuantizedSignedness": { + "Copy": { "files": { - "kernel": [ - "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp" + "common": [ + "src/cpu/operators/CpuCopy.cpp", + "src/cpu/kernels/CpuCopyKernel.cpp", + "src/runtime/NEON/functions/NECopy.cpp" ] } }, - "Convolution": { + "CropResize": { + "deps": [ "Scale" ], "files": { - "operator": [ - "src/cpu/operators/CpuConv2d.cpp" + "common": [ + "src/core/NEON/kernels/NECropKernel.cpp", + "src/runtime/NEON/functions/NECropResize.cpp" ] } }, - "Copy": { + "Deconv2d": { + "deps": [ "Conv2d", "Reverse", "Transpose"], "files": { - "operator": [ - "src/cpu/operators/CpuCopy.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuCopyKernel.cpp" + "common": [ + "src/runtime/NEON/functions/NEDeconvolutionLayer.cpp" + ] + } + }, + "DepthConvert": { + "deps": [ "Cast"], + "files": { + "common": [ + "src/runtime/NEON/functions/NEDepthConvertLayer.cpp" ] } }, - "Crop": { + "DepthToSpace": { "files": { - "kernel": [ - "src/core/NEON/kernels/NECropKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp", + "src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp" ] } }, "DepthwiseConv2d": { - "deps": [ - "Activation", - "Permute" - ], + "deps": [ "Activation", "Permute" ], "files": { - "operator": [ + "common": [ "src/cpu/operators/CpuDepthwiseConv2d.cpp", "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp", - 
"src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp" - ], - "kernel": [ + "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp", + "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp", "src/core/NEON/kernels/convolution/common/padding.cpp", "src/core/NEON/kernels/convolution/common/qasymm8.cpp", "src/core/NEON/kernels/convolution/common/qsymm8.cpp", "src/core/NEON/kernels/convolution/common/utils.cpp", - "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp" + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp", + "src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp" ], + "neon": { + "estate64": [ + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp", + 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp" + ] + }, "sve": { - "all": [ + "common": [ + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp", "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_8b_mla.cpp", "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp", "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp", @@ -994,17 +1200,7 @@ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp", - 
"src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp" - ] - }, - "neon": { - "estate64": [ - "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp", - "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp", + "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp", "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp", "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp", @@ -1059,203 +1255,135 @@ } } }, - "DepthToSpaceLayer": { - "files": { - "kernel": [ - "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp" - ] - } - }, "Dequantize": { "files": { - "operator": [ - "src/cpu/operators/CpuDequantize.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuDequantizeKernel.cpp" + "common": [ + "src/cpu/operators/CpuDequantize.cpp", + "src/cpu/kernels/CpuDequantizeKernel.cpp", + "src/runtime/NEON/functions/NEDequantizationLayer.cpp" ] } }, - "DirectConv2d": { - "deps": [ - "Activation", - "FillBorder" - ], + "DetectionPostProcess": { + "deps": [ "Dequantize" ], "files": { - "operator": [ - "src/cpu/operators/CpuDirectConv2d.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuDirectConv2dKernel.cpp", - "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp" - ] + "common" : [ "src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp" ] } }, - "Elementwise": { + "ElementwiseBinary": { "files": { - "operator": [ - "src/cpu/operators/CpuElementwise.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuElementwiseKernel.cpp" + "common": [ + 
"src/cpu/operators/CpuElementwise.cpp", + "src/cpu/kernels/CpuElementwiseKernel.cpp", + "src/runtime/NEON/functions/NEElementwiseOperations.cpp" ], "sve": { - "all": [ - "src/cpu/kernels/elementwise/sve/elementwise.cpp" - ] + "common": [ "src/cpu/kernels/elementwise/sve/elementwise.cpp" ] } } }, - "ElementwiseUnary": { + "ElementwiseUnary":{ "files": { - "operator": [ - "src/cpu/operators/CpuElementwiseUnary.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp" + "common": [ + "src/cpu/operators/CpuElementwiseUnary.cpp", + "src/cpu/kernels/CpuElementwiseUnaryKernel.cpp", + "src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp" ], "sve": { - "all": [ - "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp" - ] + "common": [ "src/cpu/kernels/elementwise/sve/elementwise_unary.cpp" ] } } }, "FFT1D": { "files": { - "kernel": [ + "common": [ "src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp", "src/core/NEON/kernels/NEFFTRadixStageKernel.cpp", - "src/core/NEON/kernels/NEFFTScaleKernel.cpp" + "src/core/NEON/kernels/NEFFTScaleKernel.cpp", + "src/runtime/NEON/functions/NEFFT1D.cpp" ] } }, - "FillBorder": { + "FFT2D": { + "deps": [ "FFT1D" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEFillBorderKernel.cpp" + "common": [ + "src/runtime/NEON/functions/NEFFT2D.cpp" ] } }, - "Flatten": { - "deps: ": [ - "Reshape" - ], + "Fill": { "files": { - "operator": [ - "src/cpu/operators/CpuFlatten.cpp" + "common": [ + "src/cpu/operators/CpuFill.cpp", + "src/cpu/kernels/CpuFillKernel.cpp", + "src/runtime/NEON/functions/NEFill.cpp" ] } }, - "Fill": { + "Flatten": { + "deps": [ "Reshape" ], "files": { - "operator": [ - "src/cpu/operators/CpuFill.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuFillKernel.cpp" + "common": [ + "src/cpu/operators/CpuFlatten.cpp", + "src/runtime/NEON/functions/NEFlattenLayer.cpp" ] } }, "Floor": { "files": { - "operator": [ - "src/cpu/operators/CpuFloor.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuFloorKernel.cpp" + "common": [ + 
"src/cpu/operators/CpuFloor.cpp", + "src/cpu/kernels/CpuFloorKernel.cpp", + "src/runtime/NEON/functions/NEFloor.cpp" ], "neon": { - "fp32": [ - "src/cpu/kernels/floor/neon/fp32.cpp" - ], - "fp16": [ - "src/cpu/kernels/floor/neon/fp16.cpp" - ] + "fp32": [ "src/cpu/kernels/floor/neon/fp32.cpp" ], + "fp16": [ "src/cpu/kernels/floor/neon/fp16.cpp" ] } } }, "FullyConnected": { - "deps": [ - "CpuFlatten", - "CpuConvertFullyConnectedWeights", - "CpuGemm", - "CpuGemmLowpMatrixMultiplyCore" - ], - "files": { - "operator": [ - "src/cpu/operators/CpuFullyConnected.cpp" - ] - }, - "kernel": [ - "CpuTransposeKernel" - ] - }, - "FuseBatchNormalization": { - "files": { - "kernel": [ - "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp" - ] - } - }, - "GEMM": { - "files": { - "operator" : ["src/cpu/operators/CpuGemm.cpp"], - "kernel": [ - "src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp", - "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp", - "src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp", - "src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp" - ] - } - }, - "GEMMLowp": { - "deps": [ - "GemmAssemblyDispatch" - ], + "deps": [ "Flatten", "Gemm", "Transpose"], "files": { - "operator" : [ - "src/cpu/operators/CpuGemmLowpOutputStage.cpp", - "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp", - "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp", - "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp", - "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp", - "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp", - "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp", - "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp", - "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp" + "common": [ + "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp", + 
"src/cpu/operators/CpuConvertFullyConnectedWeights.cpp", + "src/cpu/operators/CpuFullyConnected.cpp", + "src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp", + "src/runtime/NEON/functions/NEFullyConnectedLayer.cpp" ] } }, - "GEMMConvolution": { - "deps": [ - "Activation", - "Col2Im", - "Reshape", - "Im2Col", - "GEMMLowpOffsetContributionOutputStage", - "ConvertQuantizedSignedness" - ], + "Gather": { "files": { - "operator": [ - "src/cpu/operators/CpuGemmConv2d.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuWeightsReshapeKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEGatherKernel.cpp", + "src/runtime/NEON/functions/NEGather.cpp" ] } }, - "GemmAssemblyDispatch": { + "Gemm": { "files": { - "operator": [ - "src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp" - ], - "kernel": [ - "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp", + "common": [ + "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp", + "src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp", + "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp", + "src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp", + "src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp", + "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp", + "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp", + "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp", + "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp", + "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp", + "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp", + "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp", + "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp", + "src/cpu/operators/CpuGemm.cpp", + "src/cpu/operators/CpuGemmLowpOutputStage.cpp", + "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp", "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp", + "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp", 
"src/core/NEON/kernels/arm_gemm/gemm_int16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_int8.cpp", @@ -1263,14 +1391,17 @@ "src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp", "src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp", "src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp", + "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp", "src/core/NEON/kernels/arm_gemm/mergeresults-fp16.cpp", "src/core/NEON/kernels/arm_gemm/mergeresults.cpp", - "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp", "src/core/NEON/kernels/arm_gemm/misc.cpp", "src/core/NEON/kernels/arm_gemm/quantized.cpp", "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp", "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp", - "src/core/NEON/kernels/arm_gemm/transform.cpp" + "src/core/NEON/kernels/arm_gemm/transform.cpp", + "src/runtime/NEON/functions/NEGEMM.cpp", + "src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp", + "src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp" ], "neon": { "estate32": [ @@ -1344,7 +1475,7 @@ ] }, "sve": { - "all": [ + "common": [ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp", @@ -1384,152 +1515,196 @@ "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp", "src/core/NEON/kernels/arm_gemm/mergeresults-sve.cpp", - "src/core/NEON/kernels/arm_gemm/transform-sve.cpp" + "src/core/NEON/kernels/arm_gemm/transform-sve.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_mmla_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_4x24/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32bf16fp32_mmla_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_mmla_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/a55.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_mmla_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_mmla_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_mmla_4x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_mmla_6x16/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x6/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp", + 
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp", + "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp" ] } } }, - "GemmDirectConv2d": { - "deps": [ - "Activation", - "GemmAssemblyDispatch", - "Permute" - ], - "files": { - "operator": [ - "src/cpu/operators/CpuGemmDirectConv2d.cpp" - ] - } - }, - "Mul": { - "files": { - "operator": [ - "src/cpu/operators/CpuMul.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuMulKernel.cpp" - ] - } - }, - "Quantize": { - "files": { - "operator": [ - "src/cpu/operators/CpuQuantize.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuQuantizeKernel.cpp" - ] - } - }, - "Reshape": { - "files": { - "operator": [ - "src/cpu/operators/CpuReshape.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuReshapeKernel.cpp" - ] - } - }, - "Gather": { + "GenerateProposals": { + "deps": [ "BoundingBoxTransform", "Dequantize", "Pad", "Permute", "Quantize", "Reshape" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEGatherKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp", + "src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp" ] } }, - "GenerateProposalsLayer": { + "InstanceNormalize": { + "deps": [ "Permute", "Reduction" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp", + "src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp" ] 
} }, - "Im2Col": { + "L2Normalize": { + "deps": [ "Reduction" ], "files": { - "kernel": [ - "src/cpu/kernels/CpuIm2ColKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp", + "src/runtime/NEON/functions/NEL2NormalizeLayer.cpp" ] } }, - "InstanceNormalization": { + "Logical": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NELogicalKernel.cpp", + "src/runtime/NEON/functions/NELogical.cpp" ] } }, - "L2Normalize": { + "LSTM": { "deps": [ - "Reduction" + "Activation", + "Concatenate", + "Copy", + "Dequantize", + "ElementwiseBinary", + "Fill", + "FullyConnected", + "Gemm", + "MeanStdDevNormalize", + "Mul", + "Quantize", + "Slice", + "Transpose" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp", + "src/runtime/NEON/functions/NELSTMLayer.cpp", + "src/runtime/NEON/functions/NELSTMLayerQuantized.cpp", + "src/runtime/NEON/functions/NEQLSTMLayer.cpp" ] } }, - "Logical": { + "MaxUnpool2d": { + "deps": [ "Fill" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NELogicalKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp", + "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp" ] } }, - "MaxUnpooling": { + "Mean": { + "deps" : [ "Reduction" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp" - ] + "common": [ "src/runtime/NEON/functions/NEReduceMean.cpp" ] } }, - "MeanStdDevNormalization": { + "MeanStdDevNormalize": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp", + "src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp" ] } }, - "MinMax": { + "Mul": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEMinMaxLayerKernel.cpp" + "common": [ + "src/cpu/operators/CpuMul.cpp", + 
"src/cpu/kernels/CpuMulKernel.cpp", + "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp" ] } }, - "Normalization": { - "deps": [ - "PixelWiseMultiplication" - ], + "Normalize": { + "deps": [ "Mul" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NENormalizationLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NENormalizationLayerKernel.cpp", + "src/runtime/NEON/functions/NENormalizationLayer.cpp" ] } }, "Pad": { + "deps": [ "Concatenate", "Copy", "StridedSlice" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEPadLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEPadLayerKernel.cpp", + "src/runtime/NEON/functions/NEPadLayer.cpp" ] } }, "Permute": { "files": { - "operator": [ - "src/cpu/operators/CpuPermute.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuPermuteKernel.cpp" + "common": [ + "src/cpu/operators/CpuPermute.cpp", + "src/cpu/kernels/CpuPermuteKernel.cpp", + "src/runtime/NEON/functions/NEPermute.cpp" ] } }, "Pool2d": { "files": { - "operator": [ - "src/cpu/operators/CpuPool2d.cpp" - ], - "kernel": [ + "common": [ + "src/cpu/operators/CpuPool2d.cpp", "src/cpu/kernels/CpuPool2dKernel.cpp", "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp", @@ -1538,24 +1713,15 @@ "src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp", "src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp", - "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp" + "src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp", + "src/runtime/NEON/functions/NEPoolingLayer.cpp" ], "neon": { - "nchw": [ - "src/cpu/kernels/pool2d/neon/nchw/all.cpp" - ], - "fp32": [ - "src/cpu/kernels/pool2d/neon/fp32.cpp" - ], - "fp16": [ - "src/cpu/kernels/pool2d/neon/fp16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/pool2d/neon/qasymm8.cpp" - ], - "qasymm8_signed": [ - "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp" - ], 
+ "nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ], + "fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ], + "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ], + "qasymm8": [ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp" ], "estate64": [ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp", @@ -1578,15 +1744,17 @@ ] }, "sve": { - "all": [ - "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp", + "qasymm8": [ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp" ], + "common": [ "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp", 
"src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp", @@ -1596,239 +1764,258 @@ "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp", "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp", - "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp" + "src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp", + 
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp", + "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp" ] } } }, + "PRelu": { + "deps": [ "ElementwiseBinary" ], + "files": { + "common": [ + "src/runtime/NEON/functions/NEPReluLayer.cpp" + ] + } + }, "PriorBox": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp", + "src/runtime/NEON/functions/NEPriorBoxLayer.cpp" ] } }, - "QLSTMLayerNormalization": { + "Quantize": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp" + "common": [ + "src/cpu/operators/CpuQuantize.cpp", + "src/cpu/kernels/CpuQuantizeKernel.cpp", + "src/runtime/NEON/functions/NEQuantizationLayer.cpp" ] } }, "Range": { "files": { - "kernel": [ - "src/core/NEON/kernels/NERangeKernel.cpp" + "common": [ + "src/core/NEON/kernels/NERangeKernel.cpp", + "src/runtime/NEON/functions/NERange.cpp" ] } }, - "ReductionOperation": { + "Reduction":{ + "deps": [ "Reshape" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEReductionOperationKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEReductionOperationKernel.cpp", + "src/runtime/NEON/functions/NEReductionOperation.cpp" ] } }, "Remap": { "files": { - "kernel": [ - "src/core/NEON/kernels/NERemapKernel.cpp" + "common": [ + "src/core/NEON/kernels/NERemapKernel.cpp", + 
"src/runtime/NEON/functions/NERemap.cpp" ] } }, "Reorg": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEReorgLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEReorgLayerKernel.cpp", + "src/runtime/NEON/functions/NEReorgLayer.cpp" + ] + } + }, + "Reshape": { + "files": { + "common": [ + "src/cpu/operators/CpuReshape.cpp", + "src/cpu/kernels/CpuReshapeKernel.cpp", + "src/runtime/NEON/functions/NEReshapeLayer.cpp" ] } }, "Reverse": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEReverseKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEReverseKernel.cpp", + "src/runtime/NEON/functions/NEReverse.cpp" ] } }, + "RNN": { + "deps": [ "Activation", "Add", "FullyConnected", "Gemm"], + "files": { + "common": [ "src/runtime/NEON/functions/NERNNLayer.cpp" ] + } + }, "ROIAlign": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp", + "src/runtime/NEON/functions/NEROIAlignLayer.cpp" ] } }, - "ROIPooling": { + "ROIPool2d": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp", + "src/runtime/NEON/functions/NEROIPoolingLayer.cpp" ] } }, + "Scale": { + "files": { + "common": [ + "src/cpu/operators/CpuScale.cpp", + "src/cpu/kernels/CpuScaleKernel.cpp", + "src/runtime/NEON/functions/NEScale.cpp" + ], + "sve": { + "fp16": [ "src/cpu/kernels/scale/sve/fp16.cpp" ], + "fp32": [ "src/cpu/kernels/scale/sve/fp32.cpp" ], + "integer": [ "src/cpu/kernels/scale/sve/integer.cpp" ], + "qasymm8": [ "src/cpu/kernels/scale/sve/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/scale/sve/qasymm8_signed.cpp" ] + + }, + "neon": { + "fp16": [ "src/cpu/kernels/scale/neon/fp16.cpp" ], + "integer": [ "src/cpu/kernels/scale/neon/integer.cpp" ], + "qasymm8": [ "src/cpu/kernels/scale/neon/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/scale/neon/qasymm8_signed.cpp" ] + } + } + }, "Select": { 
"files": { - "kernel": [ - "src/core/NEON/kernels/NESelectKernel.cpp" + "common": [ + "src/core/NEON/kernels/NESelectKernel.cpp", + "src/runtime/NEON/functions/NESelect.cpp" ] } }, - "SpaceToBatch": { + "Slice": { + "deps": [ "StridedSlice" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp" - ] + "common": [ "src/runtime/NEON/functions/NESlice.cpp" ] } }, - "SpaceToDepth": { + "Softmax": { + "deps": [ + "Permute" + ], + "files": { + "common": [ + "src/cpu/operators/CpuSoftmax.cpp", + "src/cpu/kernels/CpuSoftmaxKernel.cpp", + "src/runtime/NEON/functions/NESoftmaxLayer.cpp" + ], + "sve": { + "common": [ "src/cpu/kernels/softmax/impl/sve/impl.cpp" ] + } + } + }, + "SpaceToBatch": { "files": { - "kernel": [ - "src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp", + "src/runtime/NEON/functions/NESpaceToBatchLayer.cpp" ] } }, - "Stack": { + "SpaceToDepth": { "files": { - "kernel": [ - "src/core/NEON/kernels/NEStackLayerKernel.cpp" + "common": [ + "src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp", + "src/runtime/NEON/functions/NESpaceToDepthLayer.cpp" ] } }, - "StridedSlice": { + "Split": { + "deps": [ "StridedSlice" ], "files": { - "kernel": [ - "src/core/NEON/kernels/NEStridedSliceKernel.cpp" + "common": [ + "src/runtime/NEON/functions/NESplit.cpp" ] } }, - "Scale": { + "Stack": { "files": { - "operator": [ - "src/cpu/operators/CpuScale.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuScaleKernel.cpp" - ], - "sve": { - "fp32": [ - "src/cpu/kernels/scale/sve/fp32.cpp" - ], - "fp16": [ - "src/cpu/kernels/scale/sve/fp16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/scale/sve/qasymm8.cpp" - ], - "qasymm8_signed": [ - "src/cpu/kernels/scale/sve/qasymm8_signed.cpp" - ], - "integer": [ - "src/cpu/kernels/scale/sve/integer.cpp" - ] - }, - "neon": { - "fp16": [ - "src/cpu/kernels/scale/neon/fp16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/scale/neon/qasymm8.cpp" - ], - 
"qasymm8_signed": [ - "src/cpu/kernels/scale/neon/qasymm8_signed.cpp" - ], - "integer": [ - "src/cpu/kernels/scale/neon/integer.cpp" - ] - } + "common": [ + "src/core/NEON/kernels/NEStackLayerKernel.cpp", + "src/runtime/NEON/functions/NEStackLayer.cpp" + ] } }, - "Softmax": { - "deps": [ - "Permute" - ], + "StridedSlice": { "files": { - "operator": [ - "src/cpu/operators/CpuSoftmax.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuSoftmaxKernel.cpp" - ], - "sve": { - "all": [ - "src/cpu/kernels/softmax/impl/sve/impl.cpp" - ] - } + "common": [ + "src/core/NEON/kernels/NEStridedSliceKernel.cpp", + "src/runtime/NEON/functions/NEStridedSlice.cpp" + ] } }, "Sub": { "files": { - "operator": [ - "src/cpu/operators/CpuSub.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuSubKernel.cpp" + "common": [ + "src/cpu/operators/CpuSub.cpp", + "src/cpu/kernels/CpuSubKernel.cpp", + "src/runtime/NEON/functions/NEArithmeticSubtraction.cpp" ], + "sve": { + "qasymm8": [ "src/cpu/kernels/sub/neon/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/sub/neon/qasymm8_signed.cpp" ], + "qsymm16": [ "src/cpu/kernels/sub/neon/qsymm16.cpp" ] + }, "neon": { - "qsymm16": [ - "src/cpu/kernels/sub/neon/qsymm16.cpp" - ], - "qasymm8": [ - "src/cpu/kernels/sub/neon/qasymm8.cpp" - ], - "qasymm8_signed": [ - "src/cpu/kernels/sub/neon/qasymm8_signed.cpp" - ] + "qasymm8": [ "src/cpu/kernels/sub/neon/qasymm8.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/sub/neon/qasymm8_signed.cpp" ], + "qsymm16": [ "src/cpu/kernels/sub/neon/qsymm16.cpp" ] } } }, - "Transpose": { + "Tile": { "files": { - "operator": [ - "src/cpu/operators/CpuTranspose.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuTransposeKernel.cpp" + "common": [ + "src/core/NEON/kernels/NETileKernel.cpp", + "src/runtime/NEON/functions/NETile.cpp" ] } }, - "Tile": { + "Transpose": { "files": { - "kernel": [ - "src/core/NEON/kernels/NETileKernel.cpp" + "common": [ + "src/cpu/kernels/CpuTransposeKernel.cpp", + "src/cpu/operators/CpuTranspose.cpp", + 
"src/runtime/NEON/functions/NETranspose.cpp" ] } }, - "WinogradConvolution": { - "deps": [ - "Activation", - "Permute" - ], + "Unstack": { + "deps": [ "StridedSlice" ], "files": { - "operator": [ - "src/cpu/operators/CpuWinogradConv2d.cpp" - ], - "kernel": [ - "src/cpu/kernels/CpuWinogradConv2dKernel.cpp", - "src/core/NEON/kernels/convolution/winograd/padding.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp", - 
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp", - "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp" - ] + "common": [ "src/runtime/NEON/functions/NEUnstack.cpp" ] } } } diff --git a/src/core/CL/CLKernels.h b/src/core/CL/CLKernels.h index f9d560f1b7..0c295aae6a 100644 --- a/src/core/CL/CLKernels.h +++ b/src/core/CL/CLKernels.h @@ -47,7 +47,6 @@ #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h" #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h" -#include "src/core/CL/kernels/CLMinMaxLayerKernel.h" #include "src/core/CL/kernels/CLNormalizationLayerKernel.h" #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h" #include "src/core/CL/kernels/CLPadLayerKernel.h" diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp deleted file mode 100644 index f0202a9c5d..0000000000 --- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/CL/kernels/CLMinMaxLayerKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/StringSupport.h" - -#include - -using namespace arm_compute; -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3); - - if(output->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - TensorShape output_shape = compute_min_max_shape(input); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - TensorShape output_shape = compute_min_max_shape(input); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output, output_shape, 1, input->data_type()); - - const unsigned int 
num_elems_processed_per_iteration = 1; - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowStatic output_access(output, 0, 0, 2, output->dimension(1)); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_tuple(err, win); -} -} // namespace - -CLMinMaxLayerKernel::CLMinMaxLayerKernel() - : _input(nullptr), _output(nullptr) -{ - _type = CLKernelType::ELEMENTWISE; -} - -void CLMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output) -{ - configure(CLKernelLibrary::get().get_compile_context(), input, output); -} - -void CLMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); - - _input = input; - _output = output; - - std::set build_opts; - build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0))); - build_opts.emplace("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1))); - build_opts.emplace("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); - - // Create kernel - _kernel = create_kernel(compile_context, "minmax_layer", build_opts); - - auto win_config = validate_and_configure_window(input->info(), output->info()); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - ICLKernel::configure_internal(std::get<1>(win_config)); -} - -Status CLMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - 
ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); - - return Status{}; -} - -void CLMinMaxLayerKernel::reset(cl::CommandQueue &queue) -{ - _output->map(queue, true); - - Window window_output; - window_output.use_tensor_dimensions(_output->info()->tensor_shape()); - window_output.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator output(_output, window_output); - - // Reset output - execute_window_loop(window_output, [&](const Coordinates &) - { - auto *ptr = reinterpret_cast(output.ptr()); - ptr[0] = std::numeric_limits::max(); - ptr[1] = std::numeric_limits::min(); - }, - output); - - _output->unmap(queue); -} - -void CLMinMaxLayerKernel::run(const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3); - Window slice = window_collapsed.first_slice_window_3D(); - slice.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice.set(Window::DimY, Window::Dimension(0, 1, 1)); - slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); - - do - { - Window output_slice = slice.shift_dimensions(2); - - unsigned int idx = 0; - // Set inputs - add_3D_tensor_argument(idx, _input, slice); - add_1D_tensor_argument(idx, _output, output_slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); -} diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.h b/src/core/CL/kernels/CLMinMaxLayerKernel.h deleted file mode 100644 index aa2ff3f375..0000000000 --- a/src/core/CL/kernels/CLMinMaxLayerKernel.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H -#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H - -#include "src/core/CL/ICLKernel.h" - -namespace arm_compute -{ -class ICLTensor; - -/** Interface for the kernel to perform min max search on a 3D tensor. 
- */ -class CLMinMaxLayerKernel : public ICLKernel -{ -public: - /** Default constructor */ - CLMinMaxLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete; - /** Allow instances of this class to be moved */ - CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default; - /** Allow instances of this class to be moved */ - CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default; - /** Initialise the kernel's input and output. - * - * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32. - * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor. - * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.Data types supported: F32. - * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor. - * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32. - */ - void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel - * - * @param[in] input Input tensor info. Data types supported: F32. 
- * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor. - * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); - - /** Resets global minimum and maximum - * - * @param[in,out] queue Command queue on which to map and unmap the min_max tensor - */ - void reset(cl::CommandQueue &queue); - - // Inherited methods overridden: - void run(const Window &window, cl::CommandQueue &queue) override; - -private: - const ICLTensor *_input; - ICLTensor *_output; -}; -} // namespace arm_compute -#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */ diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index 6d45a9d80c..af301c8d16 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -47,7 +47,6 @@ #include "src/core/NEON/kernels/NELogicalKernel.h" #include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h" #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" -#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h" #include "src/core/NEON/kernels/NENormalizationLayerKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp deleted file mode 100644 index 5ea8947fa0..0000000000 --- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h" - -#include "arm_compute/core/Coordinates.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/IAccessWindow.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include -#include - -using namespace arm_compute::misc::shape_calculator; - -namespace arm_compute -{ -namespace -{ -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3); - - if(output->tensor_shape().total_size() > 0) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - - TensorShape output_shape = compute_min_max_shape(input); - - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); - } - - return Status{}; -} - -std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) -{ - TensorShape output_shape = compute_min_max_shape(input); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*output, output_shape, 1, input->data_type()); - - constexpr unsigned int num_elems_processed_per_iteration = 1; - - // Configure kernel window - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, 2); - - bool window_changed = update_window_and_padding(win, input_access, output_access); - - Status err = 
(window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_tuple(err, win); -} -} // namespace - -NEMinMaxLayerKernel::NEMinMaxLayerKernel() - : _input(nullptr), _output(nullptr), _mtx() -{ -} - -void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); - - _input = input; - _output = output; - - auto win_config = validate_and_configure_window(input->info(), output->info()); - - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - INEKernel::configure(std::get<1>(win_config)); -} - -Status NEMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); - - return Status{}; -} - -void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - - const int x_start = window.x().start(); - const int x_end = window.x().end(); - - Window window_output; - window_output.use_tensor_dimensions(_output->info()->tensor_shape()); - window_output.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Handle X dimension manually to split into two loops - // First one will use vector operations, second one processes the left over pixels - Window window_input(window); - window_input.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_input.set(3, Window::Dimension(0, 1, 1)); - - Iterator input(_input, window_input); - Iterator output(_output, window_output); - - execute_window_loop(window_output, [&](const Coordinates & id_batch) - { - float32x2_t carry_min = vdup_n_f32(std::numeric_limits::max()); 
- float32x2_t carry_max = vdup_n_f32(std::numeric_limits::lowest()); - - float carry_min_scalar = std::numeric_limits::max(); - float carry_max_scalar = std::numeric_limits::lowest(); - - execute_window_loop(window_input, [&](const Coordinates &) - { - int x = x_start; - const auto in_ptr = reinterpret_cast(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]); - - // Vector loop - for(; x <= x_end - 8; x += 8) - { - const float32x4x2_t pixels = vld2q_f32(in_ptr + x); - const float32x4_t tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]); - const float32x4_t tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]); - const float32x2_t tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1)); - const float32x2_t tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1)); - carry_min = vmin_f32(tmp_min2, carry_min); - carry_max = vmax_f32(tmp_max2, carry_max); - } - - // Process leftover pixels - for(; x < x_end; ++x) - { - const float pixel = in_ptr[x]; - carry_min_scalar = std::min(pixel, carry_min_scalar); - carry_max_scalar = std::max(pixel, carry_max_scalar); - } - }, - input); - - // Reduce result - carry_min = vpmin_f32(carry_min, carry_min); - carry_max = vpmax_f32(carry_max, carry_max); - carry_min = vpmin_f32(carry_min, carry_min); - carry_max = vpmax_f32(carry_max, carry_max); - - // Extract max/min values - const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar); - const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar); - - auto out_ptr = reinterpret_cast(output.ptr()); - - // Perform reduction of local min/max values - update_min_max(out_ptr, min_i, max_i); - }, - output); -} - -void NEMinMaxLayerKernel::reset() -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - - float32x2_t reset_values = vdup_n_f32(0.0f); - reset_values = vset_lane_f32(std::numeric_limits::max(), reset_values, 0); - reset_values = vset_lane_f32(std::numeric_limits::lowest(), reset_values, 1); - - Window 
window_output; - window_output.use_tensor_dimensions(_output->info()->tensor_shape()); - window_output.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator output(_output, window_output); - - execute_window_loop(window_output, [&](const Coordinates &) - { - vst1_f32(reinterpret_cast(output.ptr()), reset_values); - }, - output); -} - -void NEMinMaxLayerKernel::update_min_max(float *out_ptr, float min, float max) -{ - arm_compute::lock_guard lock(_mtx); - - const float32x2_t old_min = vld1_dup_f32(out_ptr); - const float32x2_t old_max = vld1_dup_f32(out_ptr + 1); - const float32x2_t new_min = vmin_f32(vdup_n_f32(min), old_min); - const float32x2_t new_max = vmax_f32(vdup_n_f32(max), old_max); - - vst1_f32(out_ptr, vzip_f32(new_min, new_max).val[0]); -} -} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.h b/src/core/NEON/kernels/NEMinMaxLayerKernel.h deleted file mode 100644 index b4852ad9f2..0000000000 --- a/src/core/NEON/kernels/NEMinMaxLayerKernel.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ARM_COMPUTE_NEMINMAXLAYERKERNEL_H -#define ARM_COMPUTE_NEMINMAXLAYERKERNEL_H - -#include "src/core/NEON/INEKernel.h" -#include "support/Mutex.h" - -#include - -namespace arm_compute -{ -class ITensor; - -/** Interface for the kernel to perform min max search on a 3D tensor. */ -class NEMinMaxLayerKernel : public INEKernel -{ -public: - const char *name() const override - { - return "NEMinMaxLayerKernel"; - } - /** Default constructor */ - NEMinMaxLayerKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEMinMaxLayerKernel(const NEMinMaxLayerKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEMinMaxLayerKernel &operator=(const NEMinMaxLayerKernel &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEMinMaxLayerKernel(NEMinMaxLayerKernel &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEMinMaxLayerKernel &operator=(NEMinMaxLayerKernel &&) = delete; - /** Default destructor */ - ~NEMinMaxLayerKernel() = default; - - /** Initialise the kernel's input and outputs. - * - * @note output[0] = minimum - * @note output[1] = maximum - * - * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data type supported: F32. - * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum value for each 3D input tensor. - * The dimensions over the second must match the batched dimensions of the input tensor. 
Data types supported: F32 - */ - void configure(const ITensor *input, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel - * - * @param[in] input Input tensor info. Data types supported: F32. - * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor. - * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); - /** Resets global minimum and maximum. */ - void reset(); - - // Inherited methods overridden: - void run(const Window &window, const ThreadInfo &info) override; - -private: - void update_min_max(float *out_ptr, float min, float max); - const ITensor *_input; - ITensor *_output; - arm_compute::Mutex _mtx; -}; -} // namespace arm_compute -#endif /* ARM_COMPUTE_NEMINMAXLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp index f38912d257..1c4c7576f5 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp @@ -62,13 +62,11 @@ namespace depthwise { namespace { - bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp) { const auto qp = static_cast(_qp); return qp->b_offset == 0; } - } static const DepthwiseImplementation depthwise_s8q_methods[] = { diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp deleted file mode 100644 index de9b857977..0000000000 --- a/src/runtime/CL/functions/CLFillBorder.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "arm_compute/runtime/CL/functions/CLFillBorder.h" - -#include "src/core/CL/kernels/CLFillBorderKernel.h" - -#include "src/common/utils/Log.h" - -#include - -using namespace arm_compute; - -void CLFillBorder::configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - configure(CLKernelLibrary::get().get_compile_context(), tensor, border_width, border_mode, constant_border_value); -} - -void CLFillBorder::configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) -{ - ARM_COMPUTE_LOG_PARAMS(tensor, border_width, border_mode, constant_border_value); - auto k = std::make_unique(); - k->configure(compile_context, tensor, BorderSize(border_width), border_mode, constant_border_value); - _kernel = std::move(k); -} diff --git a/tests/framework/instruments/OpenCLTimer.cpp b/tests/framework/instruments/OpenCLTimer.cpp index 45eb4c5c60..e9f945bd95 100644 --- a/tests/framework/instruments/OpenCLTimer.cpp +++ b/tests/framework/instruments/OpenCLTimer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2019, 2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -54,7 +54,13 @@ std::string OpenCLClock::id() const template OpenCLClock::OpenCLClock(ScaleFactor scale_factor) - : _kernels(), _real_function(nullptr), _real_graph_function(nullptr), _prefix(), _timer_enabled(false) + : _kernels(), + _real_function(nullptr), +#ifdef ARM_COMPUTE_GRAPH_ENABLED + _real_graph_function(nullptr), +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ + _prefix(), + _timer_enabled(false) { auto q = CLScheduler::get().queue(); cl_command_queue_properties props = q.getInfo(); @@ -91,19 +97,17 @@ void OpenCLClock::test_start() { // Start intercepting enqueues: ARM_COMPUTE_ERROR_ON(_real_function != nullptr); - ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr); - _real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr; - _real_graph_function = graph::TaskExecutor::get().execute_function; - auto interceptor = [this]( - cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t *gwo, - const size_t *gws, - const size_t *lws, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) + _real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + auto interceptor = [this]( + cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *gwo, + const size_t *gws, + const size_t *lws, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) { if(this->_timer_enabled) { @@ -138,7 +142,11 @@ void OpenCLClock::test_start() return this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, event); } }; + CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; +#ifdef ARM_COMPUTE_GRAPH_ENABLED + ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr); + _real_graph_function = graph::TaskExecutor::get().execute_function; // Start intercepting tasks: auto task_interceptor = [this](graph::ExecutionTask & task) { @@ -153,9 +161,8 @@ void 
OpenCLClock::test_start() this->_real_graph_function(task); this->_prefix = ""; }; - - CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; graph::TaskExecutor::get().execute_function = task_interceptor; +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ } template @@ -175,9 +182,11 @@ void OpenCLClock::test_stop() { // Restore real function CLSymbols::get().clEnqueueNDRangeKernel_ptr = _real_function; + _real_function = nullptr; +#ifdef ARM_COMPUTE_GRAPH_ENABLED graph::TaskExecutor::get().execute_function = _real_graph_function; _real_graph_function = nullptr; - _real_function = nullptr; +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ } template diff --git a/tests/framework/instruments/OpenCLTimer.h b/tests/framework/instruments/OpenCLTimer.h index 9904035c20..1812272435 100644 --- a/tests/framework/instruments/OpenCLTimer.h +++ b/tests/framework/instruments/OpenCLTimer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018 Arm Limited. + * Copyright (c) 2017-2018, 2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -67,9 +67,11 @@ private: }; std::list _kernels; std::function _real_function; - std::function _real_graph_function; - std::string _prefix; - bool _timer_enabled; +#ifdef ARM_COMPUTE_GRAPH_ENABLED + std::function _real_graph_function; +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ + std::string _prefix; + bool _timer_enabled; #endif /* ARM_COMPUTE_CL */ private: diff --git a/tests/framework/instruments/SchedulerTimer.cpp b/tests/framework/instruments/SchedulerTimer.cpp index 35f960d368..b753485351 100644 --- a/tests/framework/instruments/SchedulerTimer.cpp +++ b/tests/framework/instruments/SchedulerTimer.cpp @@ -129,16 +129,24 @@ protected: private: std::list::kernel_info> &_kernels; - std::map &_layer_data_map; - IScheduler &_real_scheduler; - WallClock _timer; - std::string _prefix; + std::map &_layer_data_map; + IScheduler &_real_scheduler; + WallClock _timer; + std::string _prefix; }; template SchedulerClock::SchedulerClock(ScaleFactor scale_factor) - : _kernels(), 
_layer_data_map(), _real_scheduler(nullptr), _real_scheduler_type(), _real_graph_function(nullptr), - _scale_factor(scale_factor), _interceptor(nullptr), _scheduler_users() + : _kernels(), + _layer_data_map(), + _real_scheduler(nullptr), + _real_scheduler_type(), +#ifdef ARM_COMPUTE_GRAPH_ENABLED + _real_graph_function(nullptr), +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ + _scale_factor(scale_factor), + _interceptor(nullptr), + _scheduler_users() { if(instruments_info != nullptr) { @@ -149,6 +157,7 @@ SchedulerClock::SchedulerClock(ScaleFactor scale_factor) template void SchedulerClock::test_start() { +#ifdef ARM_COMPUTE_GRAPH_ENABLED // Start intercepting tasks: ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr); _real_graph_function = graph::TaskExecutor::get().execute_function; @@ -182,6 +191,7 @@ void SchedulerClock::test_start() scheduler->set_prefix(""); } }; +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ ARM_COMPUTE_ERROR_ON(_real_scheduler != nullptr); _real_scheduler_type = Scheduler::get_type(); @@ -191,7 +201,9 @@ void SchedulerClock::test_start() _real_scheduler = &Scheduler::get(); _interceptor = std::make_shared>(_kernels, _layer_data_map, *_real_scheduler, _scale_factor); Scheduler::set(std::static_pointer_cast(_interceptor)); +#ifdef ARM_COMPUTE_GRAPH_ENABLED graph::TaskExecutor::get().execute_function = task_interceptor; +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ // Create an interceptor for each scheduler // TODO(COMPID-2638) : Allow multiple schedulers, now it assumes the same scheduler is used. 
@@ -217,10 +229,12 @@ void SchedulerClock::test_stop() { // Restore real scheduler Scheduler::set(_real_scheduler_type); - _real_scheduler = nullptr; - _interceptor = nullptr; + _real_scheduler = nullptr; + _interceptor = nullptr; +#ifdef ARM_COMPUTE_GRAPH_ENABLED graph::TaskExecutor::get().execute_function = _real_graph_function; _real_graph_function = nullptr; +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ // Restore schedulers std::for_each(std::begin(_scheduler_users), std::end(_scheduler_users), @@ -270,9 +284,9 @@ Instrument::MeasurementsMap SchedulerClock::measurements() co } template -std::string SchedulerClock::instrument_header() const +std::string SchedulerClock::instrument_header() const { - std::string output{""}; + std::string output{ "" }; output += R"("layer_data" : {)"; for(auto i_it = _layer_data_map.cbegin(), i_end = _layer_data_map.cend(); i_it != i_end; ++i_it) { diff --git a/tests/framework/instruments/SchedulerTimer.h b/tests/framework/instruments/SchedulerTimer.h index 9cc0381a9a..c437f2717c 100644 --- a/tests/framework/instruments/SchedulerTimer.h +++ b/tests/framework/instruments/SchedulerTimer.h @@ -97,14 +97,16 @@ public: }; private: - std::list _kernels; - std::map _layer_data_map; - IScheduler *_real_scheduler; - Scheduler::Type _real_scheduler_type; + std::list _kernels; + std::map _layer_data_map; + IScheduler *_real_scheduler; + Scheduler::Type _real_scheduler_type; +#ifdef ARM_COMPUTE_GRAPH_ENABLED std::function _real_graph_function; - ScaleFactor _scale_factor; - std::shared_ptr _interceptor; - std::vector _scheduler_users; +#endif /* ARM_COMPUTE_GRAPH_ENABLED */ + ScaleFactor _scale_factor; + std::shared_ptr _interceptor; + std::vector _scheduler_users; }; using SchedulerTimer = SchedulerClock; -- cgit v1.2.1