aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- arm_compute/core/CL/CLKernels.h 4
-rw-r--r-- arm_compute/core/NEON/NEKernels.h 3
-rw-r--r-- arm_compute/runtime/CL/CLFunctions.h 2
-rw-r--r-- docs/00_introduction.dox 88
-rw-r--r-- docs/03_scripts.dox 8
-rwxr-xr-x scripts/include_functions_kernels.py 7
6 files changed, 95 insertions, 17 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index de40b85080..8da0cecad5 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -47,6 +47,8 @@
#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
#include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
@@ -76,12 +78,14 @@
#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 6fa5b5d0a4..bbb440f591 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -56,6 +56,7 @@
#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
@@ -100,5 +101,7 @@
#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
#endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 64b2deb3bf..360372d192 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -46,6 +46,7 @@
#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLDerivative.h"
#include "arm_compute/runtime/CL/functions/CLDilate.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
@@ -85,6 +86,7 @@
#include "arm_compute/runtime/CL/functions/CLPhase.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
#include "arm_compute/runtime/CL/functions/CLRemap.h"
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 4d514ea5ae..8eea0636aa 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -36,33 +36,50 @@ You should have the following file organisation:
├── arm_compute --> All the arm_compute headers
│   ├── core
│   │   ├── CL
+ │   │   │   ├── CLKernelLibrary.h --> Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context.
│   │   │   ├── CLKernels.h --> Includes all the OpenCL kernels at once
│   │   │   ├── CL specialisation of all the generic objects interfaces (ICLTensor, ICLImage, etc.)
│   │   │   ├── kernels --> Folder containing all the OpenCL kernels
│   │   │   │   └── CL*Kernel.h
│   │   │   └── OpenCL.h --> Wrapper to configure the Khronos OpenCL C++ header
│   │ ├── CPP
+ │   │   │   ├── CPPKernels.h --> Includes all the CPP kernels at once
│   │ │   └── kernels --> Folder containing all the CPP kernels
- │   │   │   │   └── CPP*Kernel.h
+ │   │   │      └── CPP*Kernel.h
│   │   ├── NEON
│   │   │   ├── kernels --> Folder containing all the NEON kernels
+ │   │   │   │ ├── arm64 --> Folder containing the interfaces for the assembly arm64 NEON kernels
+ │   │   │   │ ├── arm32 --> Folder containing the interfaces for the assembly arm32 NEON kernels
+ │   │   │   │ ├── assembly --> Folder containing the NEON assembly routines.
│   │   │   │   └── NE*Kernel.h
│   │   │   └── NEKernels.h --> Includes all the NEON kernels at once
│   │   ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.)
│   │   ├── All generic objects interfaces (ITensor, IImage, etc.)
│   │   └── Objects metadata classes (ImageInfo, TensorInfo, MultiImageInfo)
+ │   ├── graph
+ │   │   ├── CL --> OpenCL specific operations
+ │   │   │   └── CLMap.h / CLUnmap.h
+ │   │   ├── nodes
+ │   │   │   └── The various nodes supported by the graph API
+ │   │   ├── Nodes.h --> Includes all the Graph nodes at once.
+ │   │   └── Graph objects ( INode, ITensorAccessor, Graph, etc.)
│   └── runtime
│   ├── CL
│   │   ├── CL objects & allocators (CLArray, CLImage, CLTensor, etc.)
│   │   ├── functions --> Folder containing all the OpenCL functions
│   │   │   └── CL*.h
+ │   │   ├── CLScheduler.h --> Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
│   │   └── CLFunctions.h --> Includes all the OpenCL functions at once
│   ├── CPP
- │   │   └── Scheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
+ │      │   ├── CPPKernels.h --> Includes all the CPP functions at once.
+ │   │   └── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
│   ├── NEON
│   │ ├── functions --> Folder containing all the NEON functions
│   │ │   └── NE*.h
│   │ └── NEFunctions.h --> Includes all the NEON functions at once
+ │   ├── OMP
+ │   │   └── OMPScheduler.h --> OpenMP scheduler (Alternative to the CPPScheduler)
+ │ ├── Memory manager files (LifetimeManager, PoolManager, etc.)
│   └── Basic implementations of the generic object interfaces (Array, Image, Tensor, etc.)
├── documentation
│   ├── index.xhtml
@@ -74,32 +91,47 @@ You should have the following file organisation:
│   ├── neon_convolution.cpp
│   └── neon_scale.cpp
├── include
- │   └── CL
- │   └── Khronos OpenCL C headers and C++ wrapper
+ │   ├── CL
+ │   │ └── Khronos OpenCL C headers and C++ wrapper
+ │   ├── half --> FP16 library available from http://half.sourceforge.net
+ │  └── libnpy --> Library to load / write npy buffers, available from https://github.com/llohse/libnpy
├── opencl-1.2-stubs
│ └── opencl_stubs.c
+ ├── scripts
+ │   ├── caffe_data_extractor.py --> Basic script to export weights from Caffe to npy files
+ │   └── tensorflow_data_extractor.py --> Basic script to export weights from TensorFlow to npy files
├── src
│   ├── core
│ │ └── ... (Same structure as headers)
│   │ └── CL
│   │ └── cl_kernels --> All the OpenCL kernels
+ │   ├── graph
+ │ │ └── ... (Same structure as headers)
│ └── runtime
│ └── ... (Same structure as headers)
+ ├── support
+ │ └── Various headers to work around toolchain / platform issues.
├── tests
│   ├── All test related files shared between validation and benchmark
- │   ├── CL --> OpenCL specific files (shared)
- │   ├── NEON --> NEON specific files (shared)
+ │   ├── CL --> OpenCL accessors
+ │   ├── NEON --> NEON accessors
│   ├── benchmark --> Sources for benchmarking
│ │ ├── Benchmark specific files
- │ │ ├── main.cpp --> Entry point for benchmark test framework
│ │ ├── CL --> OpenCL benchmarking tests
│ │ └── NEON --> NEON benchmarking tests
+ │   ├── datasets
+ │ │ └── Datasets for all the validation / benchmark tests, layer configurations for various networks, etc.
+ │   ├── framework
+ │ │ └── Boilerplate code for both validation and benchmark test suites (Command line parsers, instruments, output loggers, etc.)
+ │   ├── networks
+ │ │ └── Examples of how to instantiate networks.
│   ├── validation --> Sources for validation
│ │ ├── Validation specific files
- │ │ ├── main.cpp --> Entry point for validation test framework
│ │ ├── CL --> OpenCL validation tests
- │ │ ├── NEON --> NEON validation tests
- │ │ └── UNIT --> Library validation tests
+ │ │ ├── CPP --> C++ reference implementations
+ │   │ ├── fixtures
+ │ │ │ └── Fixtures to initialise and run the runtime Functions.
+ │ │ └── NEON --> NEON validation tests
│   └── dataset --> Datasets defining common sets of input parameters
└── utils --> Boilerplate code used by examples
└── Utils.h
@@ -119,6 +151,35 @@ If there is more than one release in a month then an extra sequential number is
@subsection S2_2_changelog Changelog
+v17.09 Public major release
+ - Experimental Graph support: initial implementation of a simple stream API to easily chain machine learning layers.
+ - Memory Manager (@ref arm_compute::BlobLifetimeManager, @ref arm_compute::BlobMemoryPool, @ref arm_compute::ILifetimeManager, @ref arm_compute::IMemoryGroup, @ref arm_compute::IMemoryManager, @ref arm_compute::IMemoryPool, @ref arm_compute::IPoolManager, @ref arm_compute::MemoryManagerOnDemand, @ref arm_compute::PoolManager)
+ - New validation and benchmark frameworks (Boost and Google frameworks replaced by homemade framework).
+ - Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both NEON and OpenCL.
+ - New NEON kernels / functions:
+ - @ref arm_compute::NEGEMMAssemblyBaseKernel @ref arm_compute::NEGEMMAArch64Kernel
+ - @ref arm_compute::NEDequantizationLayerKernel / @ref arm_compute::NEDequantizationLayer
+ - @ref arm_compute::NEFloorKernel / @ref arm_compute::NEFloor
+ - @ref arm_compute::NEL2NormalizeKernel / @ref arm_compute::NEL2Normalize
+ - @ref arm_compute::NEQuantizationLayerKernel @ref arm_compute::NEMinMaxLayerKernel / @ref arm_compute::NEQuantizationLayer
+ - @ref arm_compute::NEROIPoolingLayerKernel / @ref arm_compute::NEROIPoolingLayer
+ - @ref arm_compute::NEReductionOperationKernel / @ref arm_compute::NEReductionOperation
+ - @ref arm_compute::NEReshapeLayerKernel / @ref arm_compute::NEReshapeLayer
+
+ - New OpenCL kernels / functions:
+ - @ref arm_compute::CLDepthwiseConvolution3x3Kernel @ref arm_compute::CLDepthwiseIm2ColKernel @ref arm_compute::CLDepthwiseVectorToTensorKernel @ref arm_compute::CLDepthwiseWeightsReshapeKernel / @ref arm_compute::CLDepthwiseConvolution3x3 @ref arm_compute::CLDepthwiseConvolution @ref arm_compute::CLDepthwiseSeparableConvolutionLayer
+ - @ref arm_compute::CLDequantizationLayerKernel / @ref arm_compute::CLDequantizationLayer
+ - @ref arm_compute::CLDirectConvolutionLayerKernel / @ref arm_compute::CLDirectConvolutionLayer
+ - @ref arm_compute::CLFlattenLayer
+ - @ref arm_compute::CLFloorKernel / @ref arm_compute::CLFloor
+ - @ref arm_compute::CLGEMMTranspose1xW
+ - @ref arm_compute::CLGEMMMatrixVectorMultiplyKernel
+ - @ref arm_compute::CLL2NormalizeKernel / @ref arm_compute::CLL2Normalize
+ - @ref arm_compute::CLQuantizationLayerKernel @ref arm_compute::CLMinMaxLayerKernel / @ref arm_compute::CLQuantizationLayer
+ - @ref arm_compute::CLROIPoolingLayerKernel / @ref arm_compute::CLROIPoolingLayer
+ - @ref arm_compute::CLReductionOperationKernel / @ref arm_compute::CLReductionOperation
+ - @ref arm_compute::CLReshapeLayerKernel / @ref arm_compute::CLReshapeLayer
+
v17.06 Public major release
- Various bug fixes
- Added support for fixed point 8 bit (QS8) to the various NEON machine learning kernels.
@@ -172,7 +233,6 @@ v17.04 Public bug fixes release
- @ref arm_compute::NENonMaximaSuppression3x3FP16Kernel
- @ref arm_compute::NENonMaximaSuppression3x3Kernel
-
v17.03.1 First Major public release of the sources
- Renamed the library to arm_compute
- New CPP target introduced for C++ kernels shared between NEON and CL functions.
@@ -303,6 +363,10 @@ To see the build options available simply run ```scons -h```:
default: False
actual: False
+ mali: Enable Mali hardware counters (yes|no)
+ default: False
+ actual: False
+
validation_tests: Build validation test programs (yes|no)
default: False
actual: False
@@ -355,6 +419,8 @@ Example:
@b pmu: Enable the PMU cycle counter to measure execution time in benchmark tests. (Your device needs to support it)
+@b mali: Enable the collection of Mali hardware counters to measure execution time in benchmark tests. (Your device needs to have a Mali driver that supports it)
+
@b openmp Build in the OpenMP scheduler for NEON.
@note Only works when building with g++ not clang++
diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox
index 2fd3907978..5601428ac2 100644
--- a/docs/03_scripts.dox
+++ b/docs/03_scripts.dox
@@ -13,7 +13,7 @@ extract the parameter values from a trained model.
@note Complex networks might require altering the script to work properly.
-@subsection how_to How to use the script
+@subsection caffe_how_to How to use the script
Install caffe following <a href="http://caffe.berkeleyvision.org/installation.html">caffe's document</a>.
Make sure pycaffe has been added to the PYTHONPATH.
@@ -30,7 +30,7 @@ For example, to extract the data from pre-trained caffe Alex model to binary fil
The script has been tested under Python2.7.
-@subsection result What is the expected output from the script
+@subsection caffe_result What is the expected output from the script
If the script runs successfully, it prints the names and shapes of each layer onto the standard
output and generates *.npy files containing the weights and biases of each layer.
@@ -60,7 +60,7 @@ when dealing with binary files with version < 0.11, pass the whole file name {mo
specified otherwise by the user. Thus should a user alter this default behavior and/or want to extract parameters from other
collections, tf.GraphKeys.TRAINABLE_VARIABLES should be replaced accordingly.
-@subsection how_to How to use the script
+@subsection tensorflow_how_to How to use the script
Install tensorflow and numpy.
@@ -82,7 +82,7 @@ Or for binary checkpoint files before Tensorflow 0.11:
The script has been tested with Tensorflow 1.2, 1.3 on Python 2.7.6 and Python 3.4.3.
-@subsection result What is the expected output from the script
+@subsection tensorflow_result What is the expected output from the script
If the script runs successfully, it prints the names and shapes of each parameter onto the standard output and generates
*.npy files containing the weights and biases of each layer.
diff --git a/scripts/include_functions_kernels.py b/scripts/include_functions_kernels.py
index ab60343c4d..e6e5f5e7d5 100755
--- a/scripts/include_functions_kernels.py
+++ b/scripts/include_functions_kernels.py
@@ -45,12 +45,15 @@ def create_include_list(folder):
return updated_files
-def include_components(path, header_prefix, folder):
+def include_components(path, header_prefix, folder, subfolders=None):
for t in targets:
target_path = path + t.name + "/"
components_file = target_path + t.prefix + header_prefix
if os.path.exists(components_file):
include_list = create_include_list(target_path + folder)
+ for s in subfolders or []:
+ include_list += create_include_list( target_path + folder + "/" + s)
+ include_list.sort()
lines = read_file(components_file)
lines, first_pos = remove_existing_includes(lines)
lines = add_updated_includes(lines, first_pos, include_list)
@@ -59,7 +62,7 @@ def include_components(path, header_prefix, folder):
if __name__ == "__main__":
# Include kernels
- include_components(core_path, "Kernels.h", "kernels")
+ include_components(core_path, "Kernels.h", "kernels", ["arm32", "arm64"])
# Include functions
include_components(runtime_path, "Functions.h", "functions")