diff options
author | Anthony Barbier <anthony.barbier@arm.com> | 2017-09-26 14:42:02 +0100 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:35:24 +0000 |
commit | 6a5627a1de8d74f0dd66b63cf31d26a8c94e107d (patch) | |
tree | 6b2f2980ed271dfb24f75a9708e69fc0038bffb4 | |
parent | 80373f607cb12693824411510c39e367a4dfbdb5 (diff) | |
download | ComputeLibrary-6a5627a1de8d74f0dd66b63cf31d26a8c94e107d.tar.gz |
COMPMID-417 Update changelog before release
Change-Id: Ia37515fb8238a03699d75751b877d5aaff5ba1a0
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/89174
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Anthony Barbier <anthony.barbier@arm.com>
-rw-r--r-- | arm_compute/core/CL/CLKernels.h | 4 | ||||
-rw-r--r-- | arm_compute/core/NEON/NEKernels.h | 3 | ||||
-rw-r--r-- | arm_compute/runtime/CL/CLFunctions.h | 2 | ||||
-rw-r--r-- | docs/00_introduction.dox | 88 | ||||
-rw-r--r-- | docs/03_scripts.dox | 8 | ||||
-rwxr-xr-x | scripts/include_functions_kernels.py | 7 |
6 files changed, 95 insertions, 17 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index de40b85080..8da0cecad5 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -47,6 +47,8 @@ #include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h" +#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h" +#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h" #include "arm_compute/core/CL/kernels/CLDerivativeKernel.h" #include "arm_compute/core/CL/kernels/CLDilateKernel.h" #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h" @@ -76,12 +78,14 @@ #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h" #include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h" +#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h" #include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h" #include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h" #include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h" #include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h" #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h" #include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h" #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h" #include "arm_compute/core/CL/kernels/CLRemapKernel.h" diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index 6fa5b5d0a4..bbb440f591 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -56,6 +56,7 @@ #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h" #include "arm_compute/core/NEON/kernels/NEFloorKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" @@ -100,5 +101,7 @@ #include "arm_compute/core/NEON/kernels/NETransposeKernel.h" #include "arm_compute/core/NEON/kernels/NEWarpKernel.h" #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" +#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h" +#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h" #endif /* __ARM_COMPUTE_NEKERNELS_H__ */ diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 64b2deb3bf..360372d192 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -46,6 +46,7 @@ #include "arm_compute/runtime/CL/functions/CLDepthConvert.h" #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h" #include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLDerivative.h" #include "arm_compute/runtime/CL/functions/CLDilate.h" #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" @@ -85,6 +86,7 @@ #include "arm_compute/runtime/CL/functions/CLPhase.h" #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h" #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h" #include "arm_compute/runtime/CL/functions/CLReductionOperation.h" #include "arm_compute/runtime/CL/functions/CLRemap.h" diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index 4d514ea5ae..8eea0636aa 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -36,33 +36,50 @@ You should have the following file organisation: ├── arm_compute --> All the arm_compute headers │ ├── core │ │ ├── CL + │ │ │ ├── CLKernelLibrary.h --> Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context. │ │ │ ├── CLKernels.h --> Includes all the OpenCL kernels at once │ │ │ ├── CL specialisation of all the generic objects interfaces (ICLTensor, ICLImage, etc.) │ │ │ ├── kernels --> Folder containing all the OpenCL kernels │ │ │ │ └── CL*Kernel.h │ │ │ └── OpenCL.h --> Wrapper to configure the Khronos OpenCL C++ header │ │ ├── CPP + │ │ │ ├── CPPKernels.h --> Includes all the CPP kernels at once │ │ │ └── kernels --> Folder containing all the CPP kernels - │ │ │ │ └── CPP*Kernel.h + │ │ │ └── CPP*Kernel.h │ │ ├── NEON │ │ │ ├── kernels --> Folder containing all the NEON kernels + │ │ │ │ ├── arm64 --> Folder containing the interfaces for the assembly arm64 NEON kernels + │ │ │ │ ├── arm32 --> Folder containing the interfaces for the assembly arm32 NEON kernels + │ │ │ │ ├── assembly --> Folder containing the NEON assembly routines. │ │ │ │ └── NE*Kernel.h │ │ │ └── NEKernels.h --> Includes all the NEON kernels at once │ │ ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.) │ │ ├── All generic objects interfaces (ITensor, IImage, etc.) │ │ └── Objects metadata classes (ImageInfo, TensorInfo, MultiImageInfo) + │ ├── graph + │ │ ├── CL --> OpenCL specific operations + │ │ │ └── CLMap.h / CLUnmap.h + │ │ ├── nodes + │ │ │ └── The various nodes supported by the graph API + │ │ ├── Nodes.h --> Includes all the Graph nodes at once. + │ │ └── Graph objects ( INode, ITensorAccessor, Graph, etc.) │ └── runtime │ ├── CL │ │ ├── CL objects & allocators (CLArray, CLImage, CLTensor, etc.) │ │ ├── functions --> Folder containing all the OpenCL functions │ │ │ └── CL*.h + │ │ ├── CLScheduler.h --> Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner. │ │ └── CLFunctions.h --> Includes all the OpenCL functions at once │ ├── CPP - │ │ └── Scheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel + │ │ ├── CPPKernels.h --> Includes all the CPP functions at once. + │ │ └── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel │ ├── NEON │ │ ├── functions --> Folder containing all the NEON functions │ │ │ └── NE*.h │ │ └── NEFunctions.h --> Includes all the NEON functions at once + │ ├── OMP + │ │ └── OMPScheduler.h --> OpenMP scheduler (Alternative to the CPPScheduler) + │ ├── Memory manager files (LifetimeManager, PoolManager, etc.) │ └── Basic implementations of the generic object interfaces (Array, Image, Tensor, etc.) ├── documentation │ ├── index.xhtml @@ -74,32 +91,47 @@ You should have the following file organisation: │ ├── neon_convolution.cpp │ └── neon_scale.cpp ├── include - │ └── CL - │ └── Khronos OpenCL C headers and C++ wrapper + │ ├── CL + │ │ └── Khronos OpenCL C headers and C++ wrapper + │ ├── half --> FP16 library available from http://half.sourceforge.net + │ └── libnpy --> Library to load / write npy buffers, available from https://github.com/llohse/libnpy ├── opencl-1.2-stubs │ └── opencl_stubs.c + ├── scripts + │ ├── caffe_data_extractor.py --> Basic script to export weights from Caffe to npy files + │ └── tensorflow_data_extractor.py --> Basic script to export weights from Tensor Flow to npy files ├── src │ ├── core │ │ └── ... (Same structure as headers) │ │ └── CL │ │ └── cl_kernels --> All the OpenCL kernels + │ ├── graph + │ │ └── ... (Same structure as headers) │ └── runtime │ └── ... (Same structure as headers) + ├── support + │ └── Various headers to work around toolchains / platform issues. ├── tests │ ├── All test related files shared between validation and benchmark - │ ├── CL --> OpenCL specific files (shared) - │ ├── NEON --> NEON specific files (shared) + │ ├── CL --> OpenCL accessors + │ ├── NEON --> NEON accessors │ ├── benchmark --> Sources for benchmarking │ │ ├── Benchmark specific files - │ │ ├── main.cpp --> Entry point for benchmark test framework │ │ ├── CL --> OpenCL benchmarking tests │ │ └── NEON --> NEON benchmarking tests + │ ├── datasets + │ │ └── Datasets for all the validation / benchmark tests, layer configurations for various networks, etc. + │ ├── framework + │ │ └── Boiler plate code for both validation and benchmark test suites (Command line parsers, instruments, output loggers, etc.) + │ ├── networks + │ │ └── Examples of how to instantiate networks. │ ├── validation --> Sources for validation │ │ ├── Validation specific files - │ │ ├── main.cpp --> Entry point for validation test framework │ │ ├── CL --> OpenCL validation tests - │ │ ├── NEON --> NEON validation tests - │ │ └── UNIT --> Library validation tests + │ │ ├── CPP --> C++ reference implementations + │ │ ├── fixtures + │ │ │ └── Fixtures to initialise and run the runtime Functions. + │ │ └── NEON --> NEON validation tests │ └── dataset --> Datasets defining common sets of input parameters └── utils --> Boiler plate code used by examples └── Utils.h @@ -119,6 +151,35 @@ If there is more than one release in a month then an extra sequential number is @subsection S2_2_changelog Changelog +v17.09 Public major release + - Experimental Graph support: initial implementation of a simple stream API to easily chain machine learning layers. + - Memory Manager (@ref arm_compute::BlobLifetimeManager, @ref arm_compute::BlobMemoryPool, @ref arm_compute::ILifetimeManager, @ref arm_compute::IMemoryGroup, @ref arm_compute::IMemoryManager, @ref arm_compute::IMemoryPool, @ref arm_compute::IPoolManager, @ref arm_compute::MemoryManagerOnDemand, @ref arm_compute::PoolManager) + - New validation and benchmark frameworks (Boost and Google frameworks replaced by homemade framework). + - Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both NEON and OpenCL. + - New NEON kernels / functions: + - @ref arm_compute::NEGEMMAssemblyBaseKernel @ref arm_compute::NEGEMMAArch64Kernel + - @ref arm_compute::NEDequantizationLayerKernel / @ref arm_compute::NEDequantizationLayer + - @ref arm_compute::NEFloorKernel / @ref arm_compute::NEFloor + - @ref arm_compute::NEL2NormalizeKernel / @ref arm_compute::NEL2Normalize + - @ref arm_compute::NEQuantizationLayerKernel @ref arm_compute::NEMinMaxLayerKernel / @ref arm_compute::NEQuantizationLayer + - @ref arm_compute::NEROIPoolingLayerKernel / @ref arm_compute::NEROIPoolingLayer + - @ref arm_compute::NEReductionOperationKernel / @ref arm_compute::NEReductionOperation + - @ref arm_compute::NEReshapeLayerKernel / @ref arm_compute::NEReshapeLayer + + - New OpenCL kernels / functions: + - @ref arm_compute::CLDepthwiseConvolution3x3Kernel @ref arm_compute::CLDepthwiseIm2ColKernel @ref arm_compute::CLDepthwiseVectorToTensorKernel @ref arm_compute::CLDepthwiseWeightsReshapeKernel / @ref arm_compute::CLDepthwiseConvolution3x3 @ref arm_compute::CLDepthwiseConvolution @ref arm_compute::CLDepthwiseSeparableConvolutionLayer + - @ref arm_compute::CLDequantizationLayerKernel / @ref arm_compute::CLDequantizationLayer + - @ref arm_compute::CLDirectConvolutionLayerKernel / @ref arm_compute::CLDirectConvolutionLayer + - @ref arm_compute::CLFlattenLayer + - @ref arm_compute::CLFloorKernel / @ref arm_compute::CLFloor + - @ref arm_compute::CLGEMMTranspose1xW + - @ref arm_compute::CLGEMMMatrixVectorMultiplyKernel + - @ref arm_compute::CLL2NormalizeKernel / @ref arm_compute::CLL2Normalize + - @ref arm_compute::CLQuantizationLayerKernel @ref arm_compute::CLMinMaxLayerKernel / @ref arm_compute::CLQuantizationLayer + - @ref arm_compute::CLROIPoolingLayerKernel / @ref arm_compute::CLROIPoolingLayer + - @ref arm_compute::CLReductionOperationKernel / @ref arm_compute::CLReductionOperation + - @ref arm_compute::CLReshapeLayerKernel / @ref arm_compute::CLReshapeLayer + v17.06 Public major release - Various bug fixes - Added support for fixed point 8 bit (QS8) to the various NEON machine learning kernels. @@ -172,7 +233,6 @@ v17.04 Public bug fixes release - @ref arm_compute::NENonMaximaSuppression3x3FP16Kernel - @ref arm_compute::NENonMaximaSuppression3x3Kernel - v17.03.1 First Major public release of the sources - Renamed the library to arm_compute - New CPP target introduced for C++ kernels shared between NEON and CL functions. @@ -303,6 +363,10 @@ To see the build options available simply run ```scons -h```: default: False actual: False + mali: Enable Mali hardware counters (yes|no) + default: False + actual: False + validation_tests: Build validation test programs (yes|no) default: False actual: False @@ -355,6 +419,8 @@ Example: @b pmu: Enable the PMU cycle counter to measure execution time in benchmark tests. (Your device needs to support it) +@b mali: Enable the collection of Mali hardware counters to measure execution time in benchmark tests. (Your device needs to have a Mali driver that supports it) + @b openmp Build in the OpenMP scheduler for NEON. @note Only works when building with g++ not clang++ diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox index 2fd3907978..5601428ac2 100644 --- a/docs/03_scripts.dox +++ b/docs/03_scripts.dox @@ -13,7 +13,7 @@ extract the parameter values from a trained model. @note complex networks might require altering the script to properly work. -@subsection how_to How to use the script +@subsection caffe_how_to How to use the script Install caffe following <a href="http://caffe.berkeleyvision.org/installation.html">caffe's document</a>. Make sure the pycaffe has been added into the PYTHONPATH. @@ -30,7 +30,7 @@ For example, to extract the data from pre-trained caffe Alex model to binary fil The script has been tested under Python2.7. -@subsection result What is the expected output from the script +@subsection caffe_result What is the expected output from the script If the script runs successfully, it prints the names and shapes of each layer onto the standard output and generates *.npy files containing the weights and biases of each layer. @@ -60,7 +60,7 @@ when dealing with binary files with version < 0.11, pass the whole file name {mo specified otherwise by the user. Thus should a user alter this default behavior and/or want to extract parameters from other collections, tf.GraphKeys.TRAINABLE_VARIABLES should be replaced accordingly. -@subsection how_to How to use the script +@subsection tensorflow_how_to How to use the script Install tensorflow and numpy. @@ -82,7 +82,7 @@ Or for binary checkpoint files before Tensorflow 0.11: The script has been tested with Tensorflow 1.2, 1.3 on Python 2.7.6 and Python 3.4.3. -@subsection result What is the expected output from the script +@subsection tensorflow_result What is the expected output from the script If the script runs successfully, it prints the names and shapes of each parameter onto the standard output and generates *.npy files containing the weights and biases of each layer. diff --git a/scripts/include_functions_kernels.py b/scripts/include_functions_kernels.py index ab60343c4d..e6e5f5e7d5 100755 --- a/scripts/include_functions_kernels.py +++ b/scripts/include_functions_kernels.py @@ -45,12 +45,15 @@ def create_include_list(folder): return updated_files -def include_components(path, header_prefix, folder): +def include_components(path, header_prefix, folder, subfolders=None): for t in targets: target_path = path + t.name + "/" components_file = target_path + t.prefix + header_prefix if os.path.exists(components_file): include_list = create_include_list(target_path + folder) + for s in subfolders or []: + include_list += create_include_list( target_path + folder + "/" + s) + include_list.sort() lines = read_file(components_file) lines, first_pos = remove_existing_includes(lines) lines = add_updated_includes(lines, first_pos, include_list) @@ -59,7 +62,7 @@ def include_components(path, header_prefix, folder): if __name__ == "__main__": # Include kernels - include_components(core_path, "Kernels.h", "kernels") + include_components(core_path, "Kernels.h", "kernels", ["arm32", "arm64"]) # Include functions include_components(runtime_path, "Functions.h", "functions") |