6 files changed, 95 insertions, 17 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index de40b85080..8da0cecad5 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -47,6 +47,8 @@
 #include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h"
 #include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h"
+#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
 #include "arm_compute/core/CL/kernels/CLDilateKernel.h"
 #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
@@ -76,12 +78,14 @@
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
 #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
 #include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
 #include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
 #include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
 #include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/CL/kernels/CLRemapKernel.h"
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 6fa5b5d0a4..bbb440f591 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -56,6 +56,7 @@
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
@@ -100,5 +101,7 @@
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
 
 #endif /* __ARM_COMPUTE_NEKERNELS_H__ */
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 64b2deb3bf..360372d192 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -46,6 +46,7 @@
 #include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
 #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
 #include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDerivative.h"
 #include "arm_compute/runtime/CL/functions/CLDilate.h"
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
@@ -85,6 +86,7 @@
 #include "arm_compute/runtime/CL/functions/CLPhase.h"
 #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
 #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 #include "arm_compute/runtime/CL/functions/CLRemap.h"
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 4d514ea5ae..8eea0636aa 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -36,33 +36,50 @@ You should have the following file organisation:
 	├── arm_compute --> All the arm_compute headers
 	│   ├── core
 	│   │   ├── CL
+	│   │   │   ├── CLKernelLibrary.h --> Manages the compilation and caching of all the OpenCL kernels, and provides accessors for the OpenCL context.
 	│   │   │   ├── CLKernels.h --> Includes all the OpenCL kernels at once
 	│   │   │   ├── CL specialisation of all the generic objects interfaces (ICLTensor, ICLImage, etc.)
 	│   │   │   ├── kernels --> Folder containing all the OpenCL kernels
 	│   │   │   │   └── CL*Kernel.h
 	│   │   │   └── OpenCL.h --> Wrapper to configure the Khronos OpenCL C++ header
 	│   │   ├── CPP
+	│   │   │   ├── CPPKernels.h --> Includes all the CPP kernels at once
 	│   │   │   └── kernels --> Folder containing all the CPP kernels
-	│   │   │   │   └── CPP*Kernel.h
+	│   │   │       └── CPP*Kernel.h
 	│   │   ├── NEON
 	│   │   │   ├── kernels --> Folder containing all the NEON kernels
+	│   │   │   │   ├── arm64 --> Folder containing the interfaces for the assembly arm64 NEON kernels
+	│   │   │   │   ├── arm32 --> Folder containing the interfaces for the assembly arm32 NEON kernels
+	│   │   │   │   ├── assembly --> Folder containing the NEON assembly routines.
 	│   │   │   │   └── NE*Kernel.h
 	│   │   │   └── NEKernels.h --> Includes all the NEON kernels at once
 	│   │   ├── All common basic types (Types.h, Window, Coordinates, Iterator, etc.)
 	│   │   ├── All generic objects interfaces (ITensor, IImage, etc.)
 	│   │   └── Objects metadata classes (ImageInfo, TensorInfo, MultiImageInfo)
+	│   ├── graph
+	│   │   ├── CL --> OpenCL specific operations
+	│   │   │   └── CLMap.h / CLUnmap.h
+	│   │   ├── nodes
+	│   │   │   └── The various nodes supported by the graph API
+	│   │   ├── Nodes.h --> Includes all the Graph nodes at once.
+	│   │   └── Graph objects (INode, ITensorAccessor, Graph, etc.)
 	│   └── runtime
 	│       ├── CL
 	│       │   ├── CL objects & allocators (CLArray, CLImage, CLTensor, etc.)
 	│       │   ├── functions --> Folder containing all the OpenCL functions
 	│       │   │   └── CL*.h
+	│       │   ├── CLScheduler.h --> Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
 	│       │   └── CLFunctions.h --> Includes all the OpenCL functions at once
 	│       ├── CPP
-	│       │   └── Scheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
+	│       │   ├── CPPKernels.h --> Includes all the CPP functions at once.
+	│       │   └── CPPScheduler.h --> Basic pool of threads to execute CPP/NEON code on several cores in parallel
 	│       ├── NEON
 	│       │   ├── functions --> Folder containing all the NEON functions
 	│       │   │   └── NE*.h
 	│       │   └── NEFunctions.h --> Includes all the NEON functions at once
+	│       ├── OMP
+	│       │   └── OMPScheduler.h --> OpenMP scheduler (Alternative to the CPPScheduler)
+	│       ├── Memory manager files (LifetimeManager, PoolManager, etc.)
 	│       └── Basic implementations of the generic object interfaces (Array, Image, Tensor, etc.)
 	├── documentation
 	│   ├── index.xhtml
@@ -74,32 +91,47 @@ You should have the following file organisation:
 	│   ├── neon_convolution.cpp
 	│   └── neon_scale.cpp
 	├── include
-	│   └── CL
-	│       └── Khronos OpenCL C headers and C++ wrapper
+	│   ├── CL
+	│   │   └── Khronos OpenCL C headers and C++ wrapper
+	│   ├── half --> FP16 library available from http://half.sourceforge.net
+	│   └── libnpy --> Library to load / write npy buffers, available from https://github.com/llohse/libnpy
 	├── opencl-1.2-stubs
 	│   └── opencl_stubs.c
+	├── scripts
+	│   ├── caffe_data_extractor.py --> Basic script to export weights from Caffe to npy files
+	│   └── tensorflow_data_extractor.py --> Basic script to export weights from TensorFlow to npy files
 	├── src
 	│   ├── core
 	│   │   └── ... (Same structure as headers)
 	│   │       └── CL
 	│   │           └── cl_kernels --> All the OpenCL kernels
+	│   ├── graph
+	│   │   └── ... (Same structure as headers)
 	│   └── runtime
 	│       └── ... (Same structure as headers)
+	├── support
+	│   └── Various headers to work around toolchain / platform issues.
 	├── tests
 	│   ├── All test related files shared between validation and benchmark
-	│   ├── CL --> OpenCL specific files (shared)
-	│   ├── NEON --> NEON specific files (shared)
+	│   ├── CL --> OpenCL accessors
+	│   ├── NEON --> NEON accessors
 	│   ├── benchmark --> Sources for benchmarking
 	│   │   ├── Benchmark specific files
-	│   │   ├── main.cpp --> Entry point for benchmark test framework
 	│   │   ├── CL --> OpenCL benchmarking tests
 	│   │   └── NEON --> NEON benchmarking tests
+	│   ├── datasets
+	│   │   └── Datasets for all the validation / benchmark tests, layer configurations for various networks, etc.
+	│   ├── framework
+	│   │   └── Boilerplate code for both validation and benchmark test suites (Command line parsers, instruments, output loggers, etc.)
+	│   ├── networks
+	│   │   └── Examples of how to instantiate networks.
 	│   ├── validation --> Sources for validation
 	│   │   ├── Validation specific files
-	│   │   ├── main.cpp --> Entry point for validation test framework
 	│   │   ├── CL --> OpenCL validation tests
-	│   │   ├── NEON --> NEON validation tests
-	│   │   └── UNIT --> Library validation tests
+	│   │   ├── CPP --> C++ reference implementations
+	│   │   ├── fixtures
+	│   │   │   └── Fixtures to initialise and run the runtime Functions.
+	│   │   └── NEON --> NEON validation tests
 	│   └── dataset --> Datasets defining common sets of input parameters
 	└── utils --> Boiler plate code used by examples
 	    └── Utils.h
@@ -119,6 +151,35 @@ If there is more than one release in a month then an extra sequential number is
 
 @subsection S2_2_changelog Changelog
 
+v17.09 Public major release
+ - Experimental Graph support: initial implementation of a simple stream API to easily chain machine learning layers.
+ - Memory Manager (@ref arm_compute::BlobLifetimeManager, @ref arm_compute::BlobMemoryPool, @ref arm_compute::ILifetimeManager, @ref arm_compute::IMemoryGroup, @ref arm_compute::IMemoryManager, @ref arm_compute::IMemoryPool, @ref arm_compute::IPoolManager, @ref arm_compute::MemoryManagerOnDemand, @ref arm_compute::PoolManager)
+ - New validation and benchmark frameworks (Boost and Google frameworks replaced by a homemade framework).
+ - Most machine learning functions support both fixed point 8 and 16 bit (QS8, QS16) for both NEON and OpenCL.
+ - New NEON kernels / functions:
+    - @ref arm_compute::NEGEMMAssemblyBaseKernel @ref arm_compute::NEGEMMAArch64Kernel
+    - @ref arm_compute::NEDequantizationLayerKernel / @ref arm_compute::NEDequantizationLayer
+    - @ref arm_compute::NEFloorKernel / @ref arm_compute::NEFloor
+    - @ref arm_compute::NEL2NormalizeKernel / @ref arm_compute::NEL2Normalize
+    - @ref arm_compute::NEQuantizationLayerKernel @ref arm_compute::NEMinMaxLayerKernel / @ref arm_compute::NEQuantizationLayer
+    - @ref arm_compute::NEROIPoolingLayerKernel / @ref arm_compute::NEROIPoolingLayer
+    - @ref arm_compute::NEReductionOperationKernel / @ref arm_compute::NEReductionOperation
+    - @ref arm_compute::NEReshapeLayerKernel / @ref arm_compute::NEReshapeLayer
+
+ - New OpenCL kernels / functions:
+    - @ref arm_compute::CLDepthwiseConvolution3x3Kernel @ref arm_compute::CLDepthwiseIm2ColKernel @ref arm_compute::CLDepthwiseVectorToTensorKernel @ref arm_compute::CLDepthwiseWeightsReshapeKernel / @ref arm_compute::CLDepthwiseConvolution3x3 @ref arm_compute::CLDepthwiseConvolution @ref arm_compute::CLDepthwiseSeparableConvolutionLayer
+    - @ref arm_compute::CLDequantizationLayerKernel / @ref arm_compute::CLDequantizationLayer
+    - @ref arm_compute::CLDirectConvolutionLayerKernel / @ref arm_compute::CLDirectConvolutionLayer
+    - @ref arm_compute::CLFlattenLayer
+    - @ref arm_compute::CLFloorKernel / @ref arm_compute::CLFloor
+    - @ref arm_compute::CLGEMMTranspose1xW
+    - @ref arm_compute::CLGEMMMatrixVectorMultiplyKernel
+    - @ref arm_compute::CLL2NormalizeKernel / @ref arm_compute::CLL2Normalize
+    - @ref arm_compute::CLQuantizationLayerKernel @ref arm_compute::CLMinMaxLayerKernel / @ref arm_compute::CLQuantizationLayer
+    - @ref arm_compute::CLROIPoolingLayerKernel / @ref arm_compute::CLROIPoolingLayer
+    - @ref arm_compute::CLReductionOperationKernel / @ref arm_compute::CLReductionOperation
+    - @ref arm_compute::CLReshapeLayerKernel / @ref arm_compute::CLReshapeLayer
+
 v17.06 Public major release
  - Various bug fixes
  - Added support for fixed point 8 bit (QS8) to the various NEON machine learning kernels.
@@ -172,7 +233,6 @@ v17.04 Public bug fixes release
  -  @ref arm_compute::NENonMaximaSuppression3x3FP16Kernel
  -  @ref arm_compute::NENonMaximaSuppression3x3Kernel
 
-
 v17.03.1 First Major public release of the sources
  - Renamed the library to arm_compute
  - New CPP target introduced for C++ kernels shared between NEON and CL functions.
@@ -303,6 +363,10 @@ To see the build options available simply run ```scons -h```:
 		default: False
 		actual: False
 
+	mali: Enable Mali hardware counters (yes|no)
+		default: False
+		actual: False
+
 	validation_tests: Build validation test programs (yes|no)
 		default: False
 		actual: False
@@ -355,6 +419,8 @@ Example:
 
 @b pmu: Enable the PMU cycle counter to measure execution time in benchmark tests. (Your device needs to support it)
 
+@b mali: Enable the collection of Mali hardware counters to measure execution time in benchmark tests. (Your device needs to have a Mali driver that supports it)
+
 @b openmp Build in the OpenMP scheduler for NEON.
 
 @note Only works when building with g++ not clang++
diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox
index 2fd3907978..5601428ac2 100644
--- a/docs/03_scripts.dox
+++ b/docs/03_scripts.dox
@@ -13,7 +13,7 @@ extract the parameter values from a trained model.
 
 @note complex networks might require altering the script to properly work.
 
-@subsection how_to How to use the script
+@subsection caffe_how_to How to use the script
 
 Install caffe following <a href="http://caffe.berkeleyvision.org/installation.html">caffe's document</a>.
 Make sure the pycaffe has been added into the PYTHONPATH.
@@ -30,7 +30,7 @@ For example, to extract the data from pre-trained caffe Alex model to binary fil
 
 The script has been tested under Python2.7.
 
-@subsection result  What is the expected output from the script
+@subsection caffe_result  What is the expected output from the script
 
 If the script runs successfully, it prints the names and shapes of each layer onto the standard
 output and generates *.npy files containing the weights and biases of each layer.
@@ -60,7 +60,7 @@ when dealing with binary files with version < 0.11, pass the whole file name {mo
 specified otherwise by the user. Thus should a user alter this default behavior and/or want to extract parameters from other
 collections, tf.GraphKeys.TRAINABLE_VARIABLES should be replaced accordingly.
 
-@subsection how_to How to use the script
+@subsection tensorflow_how_to How to use the script
 
 Install tensorflow and numpy.
 
@@ -82,7 +82,7 @@ Or for binary checkpoint files before Tensorflow 0.11:
 
 The script has been tested with Tensorflow 1.2, 1.3 on Python 2.7.6 and Python 3.4.3.
 
-@subsection result What is the expected output from the script
+@subsection tensorflow_result What is the expected output from the script
 
 If the script runs successfully, it prints the names and shapes of each parameter onto the standard output and generates
  *.npy files containing the weights and biases of each layer.
diff --git a/scripts/include_functions_kernels.py b/scripts/include_functions_kernels.py
index ab60343c4d..e6e5f5e7d5 100755
--- a/scripts/include_functions_kernels.py
+++ b/scripts/include_functions_kernels.py
@@ -45,12 +45,15 @@ def create_include_list(folder):
     return updated_files
 
 
-def include_components(path, header_prefix, folder):
+def include_components(path, header_prefix, folder, subfolders=None):
     for t in targets:
         target_path = path +  t.name + "/"
         components_file = target_path + t.prefix + header_prefix
         if os.path.exists(components_file):
             include_list = create_include_list(target_path + folder)
+            for s in subfolders or []:
+                include_list += create_include_list( target_path + folder + "/" + s)
+            include_list.sort()
             lines = read_file(components_file)
             lines, first_pos = remove_existing_includes(lines)
             lines = add_updated_includes(lines, first_pos, include_list)
@@ -59,7 +62,7 @@ def include_components(path, header_prefix, folder):
 
 if __name__ == "__main__":
     # Include kernels
-    include_components(core_path, "Kernels.h", "kernels")
+    include_components(core_path, "Kernels.h", "kernels", ["arm32", "arm64"])
 
     # Include functions
     include_components(runtime_path, "Functions.h", "functions")